Solved – How to compare two groups with multiple measurements for each individual with R

error-propagation, r, statistical-significance, t-test

I have a problem like the following:

1) There are six measurements for each individual with large within-subject variance

2) There are two groups (Treatment and Control)

3) Each group consists of 5 individuals

4) I want to perform a significance test comparing the two groups to know if the group means are different from one another.

The data looks like this:
http://s10.postimg.org/p9krg6f3t/examp.png

I have run some simulations using the code below, which performs t-tests comparing the group means. The group means were calculated by taking the means of the individual means, which ignores the within-subject variability:

    n.simulations<-10000
    pvals=matrix(nrow=n.simulations,ncol=1)
    for(k in 1:n.simulations){
      subject=NULL
      for(i in 1:10){
        subject<-rbind(subject,as.matrix(rep(i,6)))
      }
      #set.seed(42)

      #Sample Subject Means
      subject.means<-rnorm(10,100,2)

      #Sample Individual Measurements
      values=NULL
      for(sm in subject.means){
        values<-rbind(values,as.matrix(rnorm(6,sm,20)))
      }

      out<-cbind(subject,values)

      #Split into GroupA and GroupB
      GroupA<-out[1:30,]
      GroupB<-out[31:60,]

      #Add effect size to GroupA (zero here, so the null hypothesis is true)
      GroupA[,2]<-GroupA[,2]+0

      colnames(GroupA)<-c("Subject", "Value")
      colnames(GroupB)<-c("Subject", "Value")

      #Calculate Individual Means and SDS
      GroupA.summary=matrix(nrow=length(unique(GroupA[,1])), ncol=2)
      for(i in 1:length(unique(GroupA[,1]))){
        GroupA.summary[i,1]<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])
        GroupA.summary[i,2]<-sd(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])
      }
      colnames(GroupA.summary)<-c("Mean","SD")


      GroupB.summary=matrix(nrow=length(unique(GroupB[,1])), ncol=2)
      for(i in 1:length(unique(GroupB[,1]))){
        GroupB.summary[i,1]<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])
        GroupB.summary[i,2]<-sd(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])
      }
      colnames(GroupB.summary)<-c("Mean","SD")

      Summary<-rbind(cbind(1,GroupA.summary),cbind(2,GroupB.summary))
      colnames(Summary)[1]<-"Group"

      pvals[k]<-t.test(GroupA.summary[,1],GroupB.summary[,1], var.equal=T)$p.value
    }
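
Because the added effect size is zero, the null hypothesis is true in every simulation, so the fraction of significant p-values estimates the type I error rate of this means-of-means approach (the same quantity reported as "% Sig p-values" in the histogram below):

    #Proportion of null simulations significant at alpha = 0.05;
    #a calibrated test should give roughly 0.05
    mean(pvals < 0.05)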

And here is code for plots:

#Plots
par(mfrow=c(2,2))
boxplot(GroupA[,2]~GroupA[,1], col="Red", main="Group A", 
        ylim=c(.9*min(out[,2]),1.1*max(out[,2])),
        xlab="Subject", ylab="Value")
stripchart(GroupA[,2]~GroupA[,1], vert=T, pch=16, add=T)
#abline(h=mean(GroupA[,2]), lty=2, lwd=3)

for(i in 1:length(unique(GroupA[,1]))){
  m<-mean(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])
  ci<-t.test(GroupA[which(GroupA[,1]==unique(GroupA[,1])[i]),2])$conf.int[1:2]

  points(i-.2,m, pch=15,cex=1.5, col="Grey")
  segments(i-.2,
           ci[1],i-.2,
           ci[2], lwd=4, col="Grey"
  )
}
legend("topleft", legend=c("Individual Means +/- 95% CI"), bty="n", pch=15, lwd=3, col="Grey")


boxplot(GroupB[,2]~GroupB[,1], col="Light Blue", main="Group B", 
        ylim=c(.9*min(out[,2]),1.1*max(out[,2])),
        xlab="Subject", ylab="Value")
stripchart(GroupB[,2]~GroupB[,1], vert=T, pch=16, add=T)
#abline(h=mean(GroupB[,2]), lty=2, lwd=3)

for(i in 1:length(unique(GroupB[,1]))){
  m<-mean(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])
  ci<-t.test(GroupB[which(GroupB[,1]==unique(GroupB[,1])[i]),2])$conf.int[1:2]

  points(i-.2,m, pch=15,cex=1.5, col="Grey")
  segments(i-.2,
           ci[1],i-.2,
           ci[2], lwd=4, col="Grey"
  )
}
legend("topleft", legend=c("Individual Means +/- 95% CI"), bty="n", pch=15, lwd=3, col="Grey")


boxplot(Summary[,2]~Summary[,1], col=c("Red","Light Blue"), xlab="Group", ylab="Average Value",
        ylim=c(.9*min(Summary[,2]),1.1*max(Summary[,2])),
        main="Individual Averages")
stripchart(Summary[,2]~Summary[,1], vert=T, pch=16, add=T)

points(.9, mean(GroupA.summary[,1]), pch=15,cex=1.5, col="Grey")
segments(.9,
         t.test(GroupA.summary[,1])$conf.int[1],.9,
         t.test(GroupA.summary[,1])$conf.int[2], lwd=4, col="Grey"
)

points(1.9, mean(GroupB.summary[,1]), pch=15,cex=1.5, col="Grey")
segments(1.9,
         t.test(GroupB.summary[,1])$conf.int[1],1.9,
         t.test(GroupB.summary[,1])$conf.int[2], lwd=4, col="Grey"
)
legend("topleft", legend=c("Group Means +/- 95% CI"), bty="n", pch=15, lwd=3, col="Grey")


hist(pvals, breaks=seq(0,1,by=.05), col="Grey",
     main=c(paste("# sims=", n.simulations),
            paste("% Sig p-values=",100*length(which(pvals<0.05))/length(pvals)))
)

Now, it seems to me that because each individual mean is itself an estimate, we should be less certain about the group means than the 95% confidence intervals in the bottom-left panel of the figure above suggest. Thus the calculated p-values underestimate the true variability, which should lead to more false positives if we wish to extrapolate to future data.

So what is the correct way to analyze this data?

Bonus:

The example above is a simplification. For the actual data:

1) The within-subject variance is positively correlated with the mean (points 1–3 are illustrated in a sketch after this list).

2) Values can only be multiples of two.

3) The individual results are not roughly normally distributed. They suffer from a zero floor effect and have long tails at the positive end.

4) The number of subjects in each group is not necessarily equal.
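
Purely as an illustration of points 1)–3) (this is not my actual data-generating process), doubled Poisson counts have a variance that grows with the mean, take only even values, are floored at zero, and are right-skewed:

#Hypothetical sketch: doubled Poisson draws mimic points 1-3
set.seed(1)
subject.means<-rnorm(10,10,2)
values<-sapply(subject.means,function(m){
  2*rpois(6,lambda=pmax(m,0)/2)  #even-valued, zero floor, long right tail
})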

Previous literature has used the t-test, ignoring within-subject variability and the other nuances, as was done in the simulations above. Are these results reliable? If I can extract means and standard errors from the figures, how would I calculate the "correct" p-values?
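
For the last question, here is a minimal sketch of a Welch t-test computed from group-level summary statistics alone; the means, standard errors, and subject counts (m1, se1, n1, etc.) are hypothetical placeholders for values read off a figure:

#Welch t-test from summary statistics (hypothetical values)
m1<-10; se1<-1.2; n1<-5   #group 1: mean, SEM, number of subjects
m2<-14; se2<-1.5; n2<-6   #group 2

t.stat<-(m1-m2)/sqrt(se1^2+se2^2)
#Welch-Satterthwaite degrees of freedom, written in terms of the SEMs
df<-(se1^2+se2^2)^2/(se1^4/(n1-1)+se2^4/(n2-1))
p<-2*pt(-abs(t.stat),df)

Note that this still treats each subject mean as a single observation, so it inherits the same caveat as the simulations above.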

EDIT:

Ok, here is what the actual data looks like. There are also three groups rather than two:


dput() of data:

structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 
3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 
6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 
10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 
12, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 
15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 18, 
18, 18, 18, 18, 18, 2, 0, 16, 2, 16, 2, 8, 10, 8, 6, 4, 4, 8, 
22, 12, 24, 16, 8, 24, 22, 6, 10, 10, 14, 8, 18, 8, 14, 8, 20, 
6, 16, 6, 6, 16, 4, 2, 14, 12, 10, 4, 10, 10, 8, 4, 10, 16, 16, 
2, 8, 4, 0, 0, 2, 16, 10, 16, 12, 14, 12, 8, 10, 12, 8, 14, 8, 
12, 20, 8, 14, 2, 4, 8, 16, 10, 14, 8, 14, 12, 8, 14, 4, 8, 8, 
10, 4, 8, 20, 8, 12, 12, 22, 14, 12, 26, 32, 22, 10, 16, 26, 
20, 12, 16, 20, 18, 8, 10, 26), .Dim = c(108L, 3L), .Dimnames = list(
    NULL, c("Group", "Subject", "Value")))

EDIT 2:

In response to Henrik's answer:
So if I instead perform an ANOVA followed by the TukeyHSD procedure on the individual averages, as shown below, could I interpret this as underestimating my p-value by about 3-4x?

My goal with this part of the question is to understand how I, as a reader of a journal article, can better interpret previous results given their choice of analysis method. For example, they have those "stars of authority" showing me 0.01 > p > 0.001. So if I accept 0.05 as a reasonable cutoff, should I accept their interpretation? The only additional information is the mean and SEM.

#Get Individual Means
summary=NULL
for(i in unique(dat[,2])){
  sub<-which(dat[,2]==i)
  summary<-rbind(summary,cbind(
    dat[sub,1][1],        #Group (constant within subject)
    dat[sub,2][1],        #Subject ID
    mean(dat[sub,3]),
    sd(dat[sub,3])
  ))
}
colnames(summary)<-c("Group","Subject","Mean","SD")

#Note: (1|summary[,2]) is lme4 random-effect syntax; aov() does not
#interpret it that way and the term is effectively ignored
TukeyHSD(aov(summary[,3]~as.factor(summary[,1])+ (1|summary[,2])))

#      Tukey multiple comparisons of means
#        95% family-wise confidence level
#    
#    Fit: aov(formula = summary[, 3] ~ as.factor(summary[, 1]) + (1 | summary[, 2]))
#    
#    $`as.factor(summary[, 1])`
#             diff       lwr       upr     p adj
#    2-1 -0.672619 -4.943205  3.597967 0.9124024
#    3-1  7.507937  1.813822 13.202051 0.0098935
#    3-2  8.180556  2.594226 13.766885 0.0046312

EDIT 3:
I think we are getting close to an understanding. Here is the simulation described in the comments to @Stephane:

#Get Subject Means
means<-aggregate(Value~Group+Subject, data=dat, FUN=mean)

#Initialize "dat2" dataframe
dat2<-dat

#Initialize within-subject sd
s<-.001
pvals=matrix(nrow=10000,ncol=2)

for(j in 1:10000){
  #Sample individual measurements for each subject
  temp=NULL
  for(i in 1:nrow(means)){
    temp<-c(temp,rnorm(6,means[i,3],s))
  }

  #Set new values
  dat2[,3]<-temp

  #Take means of sampled values and fit the model
  dd2 <- aggregate(Value~Group+Subject, data=dat2, FUN=mean)
  fit2 <- lm(Value~Group, data=dd2)

  #Save sd and the p-value for the Group term
  pvals[j,]<-c(s,anova(fit2)[["Pr(>F)"]][1])

  #Update sd
  s<-s+.001
}

plot(pvals[,1],pvals[,2], xlab="Within-Subject SD", ylab="P-value")

[figure: plot of p-value against within-subject SD]

Best Answer

I take the liberty of answering the question in the title, namely how I would analyze this data.

Given that we have replicates within the samples, mixed models immediately come to mind: they should estimate the variability within each individual and control for it.

Hence I fit the model using lmer from lme4. However, as we are interested in p-values, I use mixed from afex, which obtains those via pbkrtest (i.e., the Kenward-Roger approximation for degrees of freedom). (afex also sets the contrasts to contr.sum, which I would use in such a case anyway.)
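
(For reference, a minimal sketch of roughly what mixed does under the hood, written directly with lme4 and pbkrtest and using the dat data frame as prepared below; this is a reconstruction, not the exact afex internals.)

require(lme4)
require(pbkrtest)

# Full model and a null model that drops the Group effect
full <- lmer(Value ~ Group + (1|Subject), data = dat)
null <- lmer(Value ~ 1 + (1|Subject), data = dat)

# Kenward-Roger approximate F test for the Group effect
KRmodcomp(full, null)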

To deal with the zero floor effect (i.e., the positive skew), I fit two alternative versions, transforming the dependent variable either with sqrt (for mild skew) or with log1p, i.e. log(1 + x), for stronger skew (plain log would map the zeros in the data to -Inf).

require(afex)

# read the dput() in as dat <- ...    
dat <- as.data.frame(dat)
dat$Group <- factor(dat$Group)
dat$Subject <- factor(dat$Subject)

(model <- mixed(Value ~ Group + (1|Subject), dat))
##        Effect    stat ndf ddf F.scaling p.value
## 1 (Intercept) 237.730   1  15         1  0.0000
## 2       Group   7.749   2  15         1  0.0049

(model.s <- mixed(sqrt(Value) ~ Group + (1|Subject), dat))
##        Effect    stat ndf ddf F.scaling p.value
## 1 (Intercept) 418.293   1  15         1  0.0000
## 2       Group   4.121   2  15         1  0.0375

(model.l <- mixed(log1p(Value) ~ Group + (1|Subject), dat))
##        Effect    stat ndf ddf F.scaling p.value
## 1 (Intercept) 458.650   1  15         1  0.0000
## 2       Group   2.721   2  15         1  0.0981

The effect is significant for the untransformed and the sqrt-transformed DV. But are these models sensible? Let's plot the residuals.

png("qq.png", 800, 300, units = "px", pointsize = 12)
par(mfrow = c(1, 3))
par(cex = 1.1)
par(mar = c(2, 2, 2, 1)+0.1)
qqnorm(resid(model[[2]]), main = "original")
qqline(resid(model[[2]]))
qqnorm(resid(model.s[[2]]), main = "sqrt")
qqline(resid(model.s[[2]]))
qqnorm(resid(model.l[[2]]), main = "log")
qqline(resid(model.l[[2]]))
dev.off()

[figure: normal QQ plots of the residuals for the original, sqrt, and log models]

It seems that the model with the sqrt transformation provides a reasonable fit (there still seems to be one outlier, but I will ignore it). So, let's further inspect this model using multcomp to get the comparisons among groups:

require(multcomp)

# using the Bonferroni-Holm correction for multiple comparisons
summary(glht(model.s[[2]], linfct = mcp(Group = "Tukey")), test = adjusted("holm"))
##          Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: lmer(formula = sqrt(Value) ~ Group + (1 | Subject), data = data)
## 
## Linear Hypotheses:
##            Estimate Std. Error z value Pr(>|z|)  
## 2 - 1 == 0  -0.0754     0.3314   -0.23    0.820  
## 3 - 1 == 0   1.1189     0.4419    2.53    0.023 *
## 3 - 2 == 0   1.1943     0.4335    2.75    0.018 *
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
## (Adjusted p values reported -- holm method)

# using the default multiple comparison correction (which I don't understand)
summary(glht(model.s[[2]], linfct = mcp(Group = "Tukey")))
##          Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: lmer(formula = sqrt(Value) ~ Group + (1 | Subject), data = data)
## 
## Linear Hypotheses:
##            Estimate Std. Error z value Pr(>|z|)  
## 2 - 1 == 0  -0.0754     0.3314   -0.23    0.972  
## 3 - 1 == 0   1.1189     0.4419    2.53    0.030 *
## 3 - 2 == 0   1.1943     0.4335    2.75    0.016 *
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
## (Adjusted p values reported -- single-step method)

Punchline: group 3 differs from the other two groups, which do not differ from each other.
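
(As an aside, the same pairwise comparisons can also be obtained with the emmeans package; a minimal sketch, not part of the original analysis:)

require(emmeans)

# Estimated marginal means on the sqrt scale plus pairwise contrasts;
# the contrasts are multiplicity-adjusted (Tukey method) by default
emmeans(model.s[[2]], pairwise ~ Group)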
