Solved – lmer with multiply imputed data

lme4-nlmemultiple-imputationr

How can I get pooled random effects for lmer after multiple imputation?

I am using mice to multiple impute a dataframe. And lme4 for a mixed model with random intercept and random slope. Pooling lmer goes fine, except that it doesn't pool the random effects. I have searched a lot for a solution with out any luck. I tried the mi package, however I only see pooled output for the estimate and std.error. I've tried exporting mice object to spss without any luck. I saw some discussion on Zelig. I thought that might solve my problem. I was however unable to figure out how to use the package with imputed data for lmer.

I know the mice package only supports pooling the fixed effects. Is there a work around?

Multiple imputation:

library(mice)
Data <- subset(Data0, select=c(id, faculty, gender, age, age_sqr, occupation, degree, private_sector, overtime, wage))
ini <- mice(Data, maxit=0, pri=F) #get predictor matrix
pred <- ini$pred
    pred[,"id"] <- 0 #don't use id as predictor
    meth <- ini$meth
meth[c("id", "faculty", "gender", "age", "age_sqr", "occupation", "degree", "private_sector", "overtime", "wage")] <- "" #don't impute these variables, use only as predictors.
imp <- mice(Data, m=22, maxit=10, printFlag=TRUE, pred=pred, meth=meth) #impute Data with 22 imputations and 10 iterations. 

Multilevel model:

library(lme4)
    fm1 <- with(imp, lmer(log(wage) ~ gender + age + age_sqr + occupation + degree + private_sector + overtime + (1+gender|faculty))) #my multilevel model
    summary(est <- pool(fm1)) #pool my results

Update
Results from pooled lmer:

> summary(est <- pool(fm1))
                                est           se            t       df     Pr(>|t|)         lo 95         hi 95 nmis       fmi    lambda
(Intercept)   7,635148e+00 0,1749178710 43,649905006 212,5553 0,000000e+00  7,2903525425  7,9799443672   NA 0,2632782 0,2563786
Gender        -1,094186e-01 0,0286629154 -3,817427078 117,1059 2,171066e-04 -0,1661834550 -0,0526537238   NA 0,3846276 0,3742069
Occupation1   1,125022e-01 0,0250082538  4,498601518 157,6557 1,320753e-05  0,0631077322  0,1618966049   NA 0,3207350 0,3121722
Occupation2   2,753089e-02 0,0176032487  1,563966385 215,6197 1,192919e-01 -0,0071655902  0,0622273689   NA 0,2606725 0,2538465
Occupation3   1,881908e-04 0,0221992053  0,008477365 235,3705 9,932433e-01 -0,0435463305  0,0439227120   NA 0,2449795 0,2385910
Age           1,131147e-02 0,0087366178  1,294719230 187,0021 1,970135e-01 -0,0059235288  0,0285464629    0 0,2871640 0,2795807
Age_sqr       -7,790476e-05 0,0001033263 -0,753968159 185,4630 4,518245e-01 -0,0002817508  0,0001259413    0 0,2887420 0,2811131
Overtime      -2,376501e-03 0,0004065466 -5,845581504 243,3563 1,614693e-08 -0,0031773002 -0,0015757019    9 0,2391179 0,2328903
Private_sector  8,322438e-02 0,0203047665  4,098760934 371,9971 5,102752e-05  0,0432978716  0,1231508962   NA 0,1688478 0,1643912

This information is missing, which I get when running lmer without multiple imputation:

Random effects:
 Groups   Name        Variance Std.Dev. Corr
 Faculty  (Intercept) 0,008383 0,09156      
          Genderfemale0,002240 0,04732  1,00
 Residual             0,041845 0,20456      
Number of obs: 698, groups:  Faculty, 17

Best Answer

You can do this somewhat by hand if by taking advantage of the lapply functionality in R and the list-structure returned by the Amelia multiple imputation package. Here's a quick example script.

library(Amelia)
library(lme4)
library(merTools)
library(plyr) # for collapsing estimates

Amelia is similar to mice so you can just substitute your variables in from the mice call here -- this example is from a project I was working on.

 a.out <- amelia(dat[sub1, varIndex], idvars = "SCH_ID", 
            noms = varIndex[!varIndex %in% c("SCH_ID", "math12")], 
            m = 10)

a.out is the imputation object, now we need to run the model on each imputed dataset. To do this, we use the lapply function in R to repeat a function over list elements. This function applies the function -- which is the model specification -- to each dataset (d) in the list and returns the results in a list of models.

 mods <- lapply(a.out$imputations,
           function(d) lmer((log(wage) ~ gender + age + age_sqr + 
            occupation + degree + private_sector + overtime + 
             (1+gender|faculty), data = d)

Now we create a data.frame from that list, by simulating the values the fixed and random effects using the functions FEsim and REsim from the merTools package

imputeFEs <- ldply(mods, FEsim, nsims = 1000)
imputeREs <- ldply(mods, REsim, nsims = 1000)

The data.frames above include separate estimates for each dataset, now we need to combine them using a collapse like argument collapse

imputeREs <- ddply(imputeREs, .(X1, X2), summarize, mean = mean(mean), 
               median = mean(median), sd = mean(sd), 
               level = level[1])

imputeFEs <- ddply(imputeFEs, .(var), summarize, meanEff = mean(meanEff), 
               medEff = mean(medEff), sdEff = mean(sdEff))

Now we can also extract some statistics on the variance/covariance for the random effects across the imputed values. Here I have written two simple extractor functions to do this.

REsdExtract <- function(model){
  out <- unlist(lapply(VarCorr(model), attr, "stddev"))
  return(out)
}

REcorrExtract <- function(model){
  out <- unlist(lapply(VarCorr(model), attr, "corre"))
  return(min(unique(out)))
}

And now we can apply them to the models and store them as a vector:

modStats <- cbind(ldply(mods, REsdExtract), ldply(mods, REcorrExtract))

Update

The functions below will get you much closer to the output provided by arm::display by operating on the list of lmer or glmer objects. Hopefully this will be incorporated into the merTools package in the near future:

# Functions to extract standard deviation of random effects from model
REsdExtract <- function(model){
  out <- unlist(lapply(VarCorr(model), attr, "stddev"))
  return(out)
}

#slope intercept correlation from model
REcorrExtract <- function(model){
  out <- unlist(lapply(VarCorr(model), attr, "corre"))
  return(min(unique(out)))
}

modelRandEffStats <- function(modList){
  SDs <- ldply(modList, REsdExtract)
  corrs <- ldply(modList, REcorrExtract)
  tmp <- cbind(SDs, corrs)
  names(tmp) <- c("Imp", "Int", "Slope", "id", "Corr")
  out <- data.frame(IntSD_mean = mean(tmp$Int), 
                        SlopeSD_mean = mean(tmp$Slope), 
                    Corr_mean = mean(tmp$Corr), 
                        IntSD_sd = sd(tmp$Int),
                    SlopeSD_sd = sd(tmp$Slope), 
                        Corr_sd = sd(tmp$Corr))
  return(out)
}

modelFixedEff <- function(modList){
  require(broom)
  fixEst <- ldply(modList, tidy, effects = "fixed")
  # Collapse
  out <- ddply(fixEst, .(term), summarize,
               estimate = mean(estimate), 
               std.error = mean(std.error))
  out$statistic <- out$estimate / out$std.error
  return(out)
}

print.merModList <- function(modList, digits = 3){
  len <- length(modList)
  form <- modList[[1]]@call
  print(form)
  cat("\nFixed Effects:\n")
  fedat <- modelFixedEff(modList)
  dimnames(fedat)[[1]] <- fedat$term
  pfround(fedat[-1, -1], digits)
  cat("\nError Terms Random Effect Std. Devs\n")
  cat("and covariances:\n")
  cat("\n")
  ngrps <- length(VarCorr(modmathG[[1]]))
  errorList <- vector(mode = 'list', length = ngrps)
  corrList <- vector(mode = 'list', length = ngrps)
  for(i in 1:ngrps){
    subList <- lapply(modList, function(x) VarCorr(x)[[i]])
    subList <- apply(simplify2array(subList), 1:2, mean)
    errorList[[i]] <- subList
    subList <- lapply(modList, function(x) attr(VarCorr(x)[[i]], "corre"))
    subList <- min(unique(apply(simplify2array(subList), 1:2, function(x) mean(x))))
    corrList[[i]] <- subList
  }
  errorList <- lapply(errorList, function(x) {
    diag(x) <- sqrt(diag(x))
    return(x)
    })

  lapply(errorList, pfround, digits)
  cat("\nError Term Correlations:\n")
  lapply(corrList, pfround, digits)
  residError <- mean(unlist(lapply(modList, function(x) attr(VarCorr(x), "sc"))))
  cat("\nResidual Error =", fround(residError,
                                             digits), "\n")
  cat("\n---Groups\n")
  ngrps <- lapply(modList[[1]]@flist, function(x) length(levels(x)))
  modn <- getME(modList[[1]], "devcomp")$dims["n"]
  cat(sprintf("number of obs: %d, groups: ", modn))
  cat(paste(paste(names(ngrps), ngrps, sep = ", "),
            collapse = "; "))
  cat("\n")
  cat("\nModel Fit Stats")
  mAIC <- mean(unlist(lapply(modList, AIC)))
  cat(sprintf("\nAIC = %g", round(mAIC, 1)))
  moDsigma.hat <- mean(unlist(lapply(modmathG, sigma)))
  cat("\nOverdispersion parameter =", fround(moDsigma.hat,
                                             digits), "\n")
}
Related Question