Negative Binomial Distribution – How to Interpret Zeroinfl Results from Emmeans

lsmeansnegative-binomial-distributiontukey-hsd-testzero inflation

I am working on the example Senecio data from Blasco‐Moreno et al. (2019) using the pscl package in R. I would like to conduct pairwise comparisons of mean rates (Damaged/Total_heads) and don't understand the estimated marginal means for zero-inflated models. This is detailed in the code below.

Inference for the negative binomial on the response scale is straightforward (this code gives rates (Damaged/Total_Heads) and ratios of those):

library(MASS)
library(pscl) # version 1.5.5.1
library(emmeans) # version 1.8.7
dat <- read.csv(file="Blasco-Moreno2019.csv", header=T)
# declare factors and match levels to Blasco-Moreno (2019)
dat$Location <- factor(dat$Location, levels=c("VF", "CB", "CP", "CT", "FM", "SS"))
dat$Species <- factor(dat$Species, levels=c("SV", "SL", "SI", "SP"))
NB <- glm.nb(Damaged ~ offset(log(Total_heads)) + Species + Location, dat)
# the back-transformed means are averaged over the levels of location 
emm.NB <- emmeans(NB, ~ Species, type="response", offset=0, weights="cells")
summary(emm.NB, infer=TRUE)
contrast(emm.NB, method="tukey", infer=TRUE)

> summary(emm.NB, infer=TRUE)
 Species response      SE  df asymp.LCL asymp.UCL null z.ratio p.value
 SV        0.0335 0.00709 Inf    0.0222    0.0508    1 -16.062  <.0001
 SL        0.2300 0.03352 Inf    0.1729    0.3060    1 -10.085  <.0001
 SI        0.0599 0.01148 Inf    0.0412    0.0872    1 -14.702  <.0001
 SP        0.0167 0.00347 Inf    0.0111    0.0250    1 -19.681  <.0001

Results are averaged over the levels of: Location 
Confidence level used: 0.95 
Intervals are back-transformed from the log scale 
Tests are performed on the log scale 

tl;dr (readers may skip to the SOLUTION at the end of this post)

Inference for the zero-inflated negative binomial on the response scale is difficult. Firstly, for the count component:

ZINB2 <- zeroinfl(Damaged ~ offset(log(Total_heads)) + Species + Location | Species, data = dat, dist = "negbin")
# what are these counts?
emm.ZINB2.count <- emmeans(ZINB2, ~ Species, mode="count", offset=0, weights="cells")
summary(emm.ZINB2.count, infer=TRUE)
contrast(emm.ZINB2.count, method="tukey", infer=TRUE)
# these appear to be rates on the log scale
emm.ZINB2.count.lin <- emmeans(ZINB2, ~ Species, mode="count", offset=0, weights="cells", lin.pred=TRUE)
summary(emm.ZINB2.count.lin, infer=TRUE)
# but the pairwise p-values are different
contrast(emm.ZINB2.count.lin, method="tukey", infer=TRUE)
# are these the offsets?
summary(emm.ZINB2.count, infer=TRUE)$emmean / exp(summary(emm.ZINB2.count.lin, infer=TRUE)$emmean)
# compared to
mean(dat$Total_heads)

> summary(emm.ZINB2.count, infer=TRUE)
 Species emmean   SE  df asymp.LCL asymp.UCL z.ratio p.value
 SV        28.2 5.73 Inf      16.9      39.4   4.919  <.0001
 SL        79.8 7.65 Inf      64.8      94.8  10.423  <.0001
 SI        15.8 1.79 Inf      12.3      19.3   8.805  <.0001
 SP        29.5 7.07 Inf      15.7      43.4   4.177  <.0001

> summary(emm.ZINB2.count.lin, infer=TRUE)
 Species emmean     SE  df asymp.LCL asymp.UCL z.ratio p.value
 SV       -2.29 0.2019 Inf     -2.68     -1.89 -11.337  <.0001
 SL       -1.24 0.0915 Inf     -1.42     -1.06 -13.533  <.0001
 SI       -2.85 0.1163 Inf     -3.07     -2.62 -24.478  <.0001
 SP       -2.22 0.2405 Inf     -2.69     -1.75  -9.222  <.0001

Results are averaged over the levels of: Location 
Results are given on the log (not the response) scale. 
Confidence level used: 0.95

> summary(emm.ZINB2.count, infer=TRUE)$emmean / exp(summary(emm.ZINB2.count.lin, infer=TRUE)$emmean)
[1] 277.8002 275.2955 271.5769 271.2817
> mean(dat$Total_heads)
[1] 257.6232

The zero component results are more satisfactory:

# weights are NA becasue zero ~ Species only
emm.ZINB2.zero <- emmeans(ZINB2, ~ Species, mode="zero")
# these are probabilities as expected
summary(emm.ZINB2.zero, infer=TRUE)
# these contrasts are differences in probs
contrast(emm.ZINB2.zero, method="tukey", infer=TRUE)
# these contrasts are differences of qlogis(prob); not log odds ratios
# again, the p-values are different
emm.ZINB2.zero.lin <- emmeans(ZINB2, ~ Species, mode="zero", lin.pred=TRUE)
summary(emm.ZINB2.zero.lin, infer=TRUE)
contrast(emm.ZINB2.zero.lin, method="tukey", infer=TRUE)

> summary(emm.ZINB2.zero, infer=TRUE)
 Species emmean     SE  df asymp.LCL asymp.UCL z.ratio p.value
 SV      0.6927 0.0638 Inf    0.5676     0.818  10.854  <.0001
 SL      0.1928 0.0477 Inf    0.0994     0.286   4.045  0.0001
 SI      0.0435 0.0484 Inf   -0.0514     0.138   0.899  0.3686
 SP      0.8344 0.0421 Inf    0.7518     0.917  19.804  <.0001

> summary(emm.ZINB2.zero.lin, infer=TRUE)
 Species emmean    SE  df asymp.LCL asymp.UCL z.ratio p.value
 SV       0.813 0.300 Inf     0.225     1.400   2.711  0.0067
 SL      -1.432 0.306 Inf    -2.032    -0.831  -4.674  <.0001
 SI      -3.090 1.163 Inf    -5.369    -0.810  -2.657  0.0079
 SP       1.617 0.305 Inf     1.020     2.215   5.303  <.0001

The counts problem reappears for mode="response", which I think estimates the overall means:

# again, what are these counts?
emm.ZINB2 <- emmeans(ZINB2, ~ Species, mode="response", offset=0, weights="cells")
summary(emm.ZINB2, infer=TRUE)
contrast(emm.ZINB2, method="tukey", infer=TRUE)
# no difference if lin.pred=TRUE
emm.ZINB2.lin <- emmeans(ZINB2, ~ Species, mode="response", offset=0, weights="cells", lin.pred=TRUE)
summary(emm.ZINB2.lin, infer=TRUE)
contrast(emm.ZINB2.lin, method="tukey", infer=TRUE)

> summary(emm.ZINB2, infer=TRUE)
 Species emmean   SE  df asymp.LCL asymp.UCL z.ratio p.value
 SV        8.66 1.97 Inf      4.80     12.52   4.396  <.0001
 SL       64.40 6.00 Inf     52.65     76.16  10.741  <.0001
 SI       15.08 1.63 Inf     11.87     18.28   9.226  <.0001
 SP        4.89 1.67 Inf      1.61      8.17   2.921  0.0035

Results are averaged over the levels of: Location 
Confidence level used: 0.95 

It does seem that mode="response" does correctly mix zero and count components according to the formula mean = p*0 + (1 – p)*count.mean = (1 – p)*count.mean where p is from the zero component (probability scale) and count.mean is from the count component (count/rate scale):

# from the count component
offs <- summary(emm.ZINB2.count, infer=TRUE)$emmean / exp(summary(emm.ZINB2.count.lin, infer=TRUE)$emmean)
# are these rates?
summary(emm.ZINB2, infer=TRUE)$emmean / offs
(1-summary(emm.ZINB2.zero, infer=TRUE)$emmean)*exp(summary(emm.ZINB2.count.lin, infer=TRUE)$emmean)

> summary(emm.ZINB2, infer=TRUE)$emmean / offs
[1] 0.03116506 0.23394104 0.05551120 0.01802210
> (1-summary(emm.ZINB2.zero, infer=TRUE)$emmean)*exp(summary(emm.ZINB2.count.lin, infer=TRUE)$emmean)
[1] 0.03116506 0.23394104 0.05551120 0.01802210

I would still like to be able to report pairwise comparisons for the overall mean on the response scale (rate scale = count divided by offset) similar to those results for the NB model above. How?

SOLUTION based on Russell Lenth's advice

From first principles (see the documentation for the emmeans package) and ignoring the weights complication:

RG <- expand.grid(Species=levels(dat$Species), Location=unique(dat$Location), Total_heads=mean(dat$Total_heads)) # reference grid
RG
preds <- matrix(predict(ZINB2, newdata = RG), nrow = 4)
preds # predicted mean counts, Species (rows) x Location (columns)
apply(preds, 1, mean) # row means
# compared to
emm.ZINB2 <- emmeans(ZINB2, ~ Species, mode="response")
emm.ZINB2 # same same

Re-examining the count component for the zero-inflated model:

# mean Species rates on the log scale, averaged across Location, for the count component only
emm.ZINB2.count.lin <- emmeans(ZINB2, ~ Species, mode="count", offset=0, weights="cells", lin.pred=TRUE)
emm.ZINB2.count.lin
emm.ZINB2.count.lin@grid # offset = 0, weights are sample sizes by Species
exp(c(-2.29,-1.24,-2.85,-2.22)) # back transformed rates
# mean Species counts on the log scale, averaged across Location, for the count component only
# removing offset = 0 to get the offset
emm.ZINB2.count.lin <- emmeans(ZINB2, ~ Species, mode="count", weights="cells", lin.pred=TRUE)
emm.ZINB2.count.lin
emm.ZINB2.count.lin@grid # offset = log(mean(dat$Total_heads))
exp(c(3.26, 4.31, 2.70, 3.33))/exp(5.551498) # rates
# mean Species counts, averaged across Location, for the count component only
emm.ZINB2.count <- emmeans(ZINB2, ~ Species, mode="count", weights="cells")
emm.ZINB2.count
emm.ZINB2.count@grid # no offset, weights are sample sizes by Species
c(28.2, 79.8, 15.8, 29.5)/exp(5.551498) # these rates don't quite match those above, why?

Re-examining overall means for the zero-inflated model:

# lin.pred does nothing for zeroinfl and mode="response" for which there are two link functions (log and logit)
# mean Species counts, averaged across Location
emm.ZINB2 <- emmeans(ZINB2, ~ Species, mode="response", offset=0, weights="cells")
emm.ZINB2
emm.ZINB2@grid # offset = 0, weights are sample sizes by Species
# offset also does nothing for zeroinfl and mode="response"
emm.ZINB2 <- emmeans(ZINB2, ~ Species, mode="response", weights="cells")
emm.ZINB2
emm.ZINB2@grid # no offset, weights are sample sizes by Species
c(8.66, 64.40, 15.08, 4.89)/exp(5.551498) # rates, taking the offset from the count component
# contrasts of counts
contrast(emm.ZINB2.lin, method="tukey", infer=TRUE)
# contrasts of rates, using scale as suggested by Russell Lenth
contrast(emm.ZINB2.lin, method="tukey", infer=TRUE, scale = exp(-offs))
# these contrasts are differences rather than ratios like for the NB model
c(8.66-64.40, 8.66-15.08, 8.66-4.89, 64.40-15.08, 64.40-4.89, 15.08-4.89)/exp(5.551498)

Lesson learnt: emmeans parameters offset and lin.pred are not particularly meaningful for zero-inflated models with mode="response". Overall mean counts can be scaled as rates using the offset from mode="count". As commented in the documentation for the emmeans package, statistics is hard.

Reference

Blasco‐Moreno A., Pérez‐Casany M., Puig P., Morante M. & Castells E. (2019). What does a zero mean? Understanding false, random and structural zeros in ecology. Methods in Ecology and Evolution 10: 949–959. https://doi.org/10.1111/2041-210X.13185

Best Answer

If I understand correctly what you did, your emm.ZINB.count.lin object estimates the log of the count component of the model, without the zero inflation. It does not give the offsets. Please note that offsets are fixed quantities -- they are not estimated. If you look at emm.ZINB2.count.lin@grid, you will see extra columns named .wgt. and .offset., and the latter indicates the (fixed) offset for each node in the reference grid. Since these offsets do not depend on Species, it will be constant.

Meanwhile, your emm.ZINB2.lin does estimate the mean counts, accounting for zero inflation. So to get rates, you may do

offs <- emm.ZINB2.count.lin@grid[1, ".offset."]  # this is on the log scale
contrast(emm.ZINB2.lin, method="tukey", infer=TRUE, scale = exp(-offs))

which will divide each contrast by exp(offs)

Related Question