Re: [R] speeding up likelihood computation

From: jim holtman <jholtman_at_gmail.com>
Date: Sun, 2 Dec 2007 20:54:13 -0500

One thing that I would suggest that you do is to use Rprof on a subset of the data that runs for 10-15 minutes and see where some of the hot spots are. Since you have not provided commented, minimal, self-contained, reproducible code, it is hard to determine where the inefficiencies are since we don't have any data to run against it. Some of the loop look like you are just assigning a value to a vector, e.g.,

> if (alive[j]==N1) {
>
> for (i in 1:(N1-1)) {
> S[N1,i] <- (q^(nb[j]))*((1-q)^(ng[j]))
> }
> }
>
> else {
> for (i in 1:(N1-1)) {
> S[N1,i] <- 0
> }
>
> }

that can be done without loops, but without data, it is hard to determine. Run Rprof and see what summary.Rprof shows to indicate where to focus on.

On Dec 2, 2007 12:49 PM, DEEPANKAR BASU <basu.15_at_osu.edu> wrote:
> R Users:
>
> I am trying to estimate a model of fertility behaviour using birth history data with maximum likelihood. My code works but is extremely slow (because of several for loops and my programming inefficiencies); when I use the genetic algorithm to optimize the likelihood function, it takes several days to complete (on a machine with Intel Core 2 processor [2.66GHz] and 2.99 GB RAM). Computing the hessian and using it to calculate the standard errors takes a large chunk of this time.
>
> I am copying the code for my likelihood function below; it would be great if someone could suggest any method to speed up the code (by avoiding the for loops or by any other method).
>
> I am not providing details of my model or what exactly I am trying to do in each step of the computation below; i would be happy to provide these details if they are deemed necessary for re-working the code.
>
> Thanks.
> Deepankar
>
>
> --------- begin code -----------------------
>
> LLK1 <- function(paramets, data.frame, ...) { # DEFINING THE LOGLIKELIHOOD FUNCTION
>
> # paramets IS A 1x27 VECTOR OF PARAMETERS OVER WHICH THE FUNCTION WILL BE MAXIMISED
> # data.frame IS A DATA FRAME. THE DATA FRAME CONTAINS OBSERVATIONS ON SEVERAL VARIABLES
> # (LIKE EDUCATION, AGE, ETC.) FOR EACH RESPONDENT. COLUMNS REFER TO VARIABLES AND ROWS REFER
> # TO OBSERVATIONS.
>
> ########## PARAMETERS ###############################
>
> # alpha: interaction between son targeting and family size
> # beta : son targeting
> # gamma : family size
> # delta : a 1x6 vector of probabilities of male birth at various parities (q1, q2, q3, q4, q5, q6)
> # zeta : a 1x11 vector of conditional probabilities with zeta[1]=1 always
>
> alpha <- paramets[1] # FIRST PARAMETER
> beta <- paramets[2:9] # SECOND TO SEVENTH PARAMETER
> gamma <- paramets[10:16]
> delta <- paramets[17]
> zeta <- paramets[18:27] # LAST 10 PARAMETERS
>
> ################# VARIABLES ###############################
> # READING IN THE VARIABLES IN THE DATA FRAME
> # AND RENAMING THEM FOR USE IN THE LIKELIHOOD FUNCTION
>
> everborn <- data.frame$v201
> alive <- data.frame$alive
> age <- data.frame$age
> edu <- data.frame$edu
> rural <- data.frame$rur
> rich <- data.frame$rich
> middle <- data.frame$middle
> poor <- data.frame$poor
> work <- data.frame$work
> jointfam <- data.frame$jfam
> contracep <- data.frame$contra
> hindu <- data.frame$hindu
> muslim <- data.frame$muslim
> scaste <- data.frame$scaste
> stribe <- data.frame$stribe
> obc <- data.frame$obc
> ucaste <- data.frame$ucaste
> N <- data.frame$dfsize
> indN <- data.frame$dfsize1 # INDICATOR FUNCTION THAT dfsize==alive
> nb <- data.frame$nboy
> ng <- data.frame$ngirl
> ncord1 <- data.frame$ncord1 # FIRST CHILD: BOY=0; GIRL=1
> ncord2 <- data.frame$ncord2 #SECOND CHILD: BOY=0; GIRL=1
> ncord3 <- data.frame$ncord3
> ncord4 <- data.frame$ncord4
> ncord5 <- data.frame$ncord5
> ncord6 <- data.frame$ncord6 # SIXTH CHILD: BOY=0; GIRL=1
>
>
>
> ######### POSITION OF i^th BOY ################################################
> boy1 <- data.frame$boy1 # BIRTH POSITION OF FIRST BOY (ZERO IF THE FAMILY HAS NO BOYS)
> boy2 <- data.frame$boy2 # BIRTH POSITION OF SECOND BOY (ZERO IF THE FAMILY HAS ONLY ONE BOY)
> boy3 <- data.frame$boy3
> boy4 <- data.frame$boy4
> boy5 <- data.frame$boy5
> boy6 <- data.frame$boy6 # BIRTH POSITION OF SIXTH BOY (ZERO IF THE FAMILY HAS ONLY FIVE BOYS)
>
>
> ######################## CONDITIONAL PROBABILITIES ##########################
> qq21 <- 1
>
> qq31 <- 1/(1+exp(zeta[1]))
> qq32 <- exp(zeta[1])/(1+exp(zeta[1]))
>
> qq41 <- 1/(1+exp(zeta[2])+exp(zeta[3]))
> qq42 <- exp(zeta[2])/(1+exp(zeta[2])+exp(zeta[3]))
> qq43 <- exp(zeta[3])/(1+exp(zeta[2])+exp(zeta[3]))
>
> qq51 <- 1/(1+exp(zeta[4])+exp(zeta[5])+exp(zeta[6]))
> qq52 <- exp(zeta[4])/(1+exp(zeta[4])+exp(zeta[5])+exp(zeta[6]))
> qq53 <- exp(zeta[5])/(1+exp(zeta[4])+exp(zeta[5])+exp(zeta[6]))
> qq54 <- exp(zeta[6])/(1+exp(zeta[4])+exp(zeta[5])+exp(zeta[6]))
>
> qq61 <- 1/(1+exp(zeta[7])+exp(zeta[8])+exp(zeta[9])+exp(zeta[10]))
> qq62 <- exp(zeta[7])/(1+exp(zeta[7])+exp(zeta[8])+exp(zeta[9])+exp(zeta[10]))
> qq63 <- exp(zeta[8])/(1+exp(zeta[7])+exp(zeta[8])+exp(zeta[9])+exp(zeta[10]))
> qq64 <- exp(zeta[9])/(1+exp(zeta[7])+exp(zeta[8])+exp(zeta[9])+exp(zeta[10]))
> qq65 <- exp(zeta[10])/(1+exp(zeta[7])+exp(zeta[8])+exp(zeta[9])+exp(zeta[10]))
>
> zeta1 <- c(qq21,qq31,qq32,qq41,qq42,qq43,qq51,qq52,qq53,qq54,qq61,qq62,qq63,qq64,qq65)
>
> #########################################################################
>
> n <- length(N) # LENGTH OF SAMPLE; SIZE OF THE MAIN LOOP
>
> lglik <- numeric(n) # INITIALIZING THE LOGLIKELIHOOD FUNCTION
> # CREATES A 1xn VECTOR OF ZEROS
>
> for (j in 1:n) { # START OF MAIN LOOP
>
> S <- matrix(0, 6, 6) # CREATE A 6x6 MATRIX OF ZEROS
> y <- numeric(15) # CREATE A 1x15 VECTOR OF ZEROS
> N1 <- N[j] # DESIRED FAMILY SIZE
>
>
> q <- 1/(1+exp(delta)) # PROBABILITY OF MALE BIRTH
>
>
> if (alive[j]==N1) {
>
> for (i in 1:(N1-1)) {
> S[N1,i] <- (q^(nb[j]))*((1-q)^(ng[j]))
> }
> }
>
> else {
> for (i in 1:(N1-1)) {
> S[N1,i] <- 0
> }
>
> }
>
> ######### CREATE A 1x6 VECTOR WITH POSITION OF BOYS WITHIN FAMILY
> x <- c(boy1[j], boy2[j], boy3[j], boy4[j], boy5[j])
>
> if (N1>1) {
> for (i in 1:(N1-1)) {
> if (alive[j]>x[i] & x[i]>0) {
> S[N1,i] <- 0
> }
> if (x[i] == alive[j] ) {
> S[N1,i] <- (q^(nb[j]))*((1-q)^(ng[j]))
> }
> }
> }
>
> y <- c(S[2,1],S[3,1],S[3,2],S[4,1],S[4,2],S[4,3],S[5,1],S[5,2],S[5,3],S[5,4],S[6,1],S[6,2],S[6,3],S[6,4],S[6,5])
>
>
> z1 <- c(age[j],edu[j],work[j],rural[j],poor[j],middle[j],hindu[j]) # DETERMINANTS OF FAMILY SIZE
> z2 <- c(1,age[j],edu[j],work[j],contracep[j],rural[j],jointfam[j],hindu[j]) # DETERMINANTS OF SON TARGETING
>
> t1 <- (indN[j])*((q^(nb[j]))*((1-q)^(ng[j])))*(exp(-exp(sum(z1*gamma)))*((exp(sum(z1*gamma)))^N1)*pnorm(-sum(z2*beta)))/factorial(N1)
> t2 <- (sum(y*zeta1))*(exp(-exp(sum(z1*gamma) + alpha))*((exp(sum(z1*gamma) + alpha))^N1)*(1-pnorm(-sum(z2*beta)))/factorial(N1))
> lglik[j] <- log(t1+t2)
> }
>
> return(-sum(lglik)) # RETURNING THE NEGATIVE OF THE LOGLIKELIHOOD
> # SUMMED OVER ALL OBSERVATIONS
>
>
> }
>
> ------------ end code ----------------------
>
> ______________________________________________
> R-help_at_r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>

-- 
Jim Holtman
Cincinnati, OH
+1 513 646 9390

What is the problem you are trying to solve?

______________________________________________
R-help_at_r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
Received on Mon 03 Dec 2007 - 01:58:28 GMT

Archive maintained by Robert King, hosted by the discipline of statistics at the University of Newcastle, Australia.
Archive generated by hypermail 2.2.0, at Mon 03 Dec 2007 - 19:30:17 GMT.

Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-help. Please read the posting guide before posting to the list.