Regression – How to Interpret Polynomial Regression Bootstrap Output?

bootstrap, confidence-interval, nonlinear-regression, polynomial, regression

I have the following data:

structure(list(Value= c(0.228881, 0.26864, 0.2746545, 0.2588095, 
0.2212295, 0.262956, 0.239068, 0.2645695, 0.270166, 0.2464375, 
0.2488355, 0.260082, 0.228691, 0.2642105, 0.255971, 0.236502, 
0.316483, 0.2635265, 0.2293235, 0.2490335, 0.277711, 0.3510685, 
0.261011, 0.2491845, 0.2761005, 0.2556445, 0.2212895, 0.2502325, 
0.251632, 0.2535995, 0.2646355, 0.250128, 0.2628675, 0.2525865, 
0.267544, 0.243567, 0.2433465, 0.262231, 0.267147, 0.254661, 
0.2522155, 0.261094, 0.260184, 0.2574835, 0.3249415, 0.2602425, 
0.2749145, 0.235879, 0.259291, 0.2519655, 0.260457, 0.2461125, 
0.3238965, 0.2454995, 0.2547415, 0.276334, 0.26388, 0.229455, 
0.375393, 0.2816165, 0.2516335, 0.245579, 0.3038465, 0.3206155, 
0.2868495, 0.2556445, 0.263657, 0.334416, 0.317611, 0.31837, 
0.3248685, 0.3390745, 0.314509, 0.3346625, 0.3089435, 0.3258815, 
0.3290105, 0.327173, 0.3170855, 0.2896965, 0.2958815, 0.297433, 
0.333545, 0.3231185, 0.3358645, 0.361661, 0.3254935, 0.296155, 
0.3181945, 0.2869195, 0.3351975, 0.3051585, 0.314985, 0.315691, 
0.332565, 0.2660795, 0.3195265, 0.307352, 0.29095, 0.29972, 0.302007, 
0.3325525, 0.318101, 0.2955135, 0.3089395, 0.3481915, 0.2785565, 
0.3471645, 0.3290795, 0.315884, 0.311812, 0.251254, 0.3061215, 
0.269718, 0.334314, 0.3195015, 0.2758345, 0.3181295, 0.3232825, 
0.327678, 0.3539715, 0.3066505, 0.307193, 0.3124935, 0.224277, 
0.302592, 0.327028, 0.30327, 0.307637, 0.28713, 0.312529, 0.330831, 
0.2510545, 0.307193, 0.361899, 0.231769, 0.297745, 0.350601, 
0.2862895, 0.337318, 0.316977, 0.2623725, 0.300464, 0.302148, 
0.321019, 0.335075, 0.3061115, 0.3245545, 0.334073, 0.3318655, 
0.3464885, 0.348252, 0.3281445, 0.301745, 0.314267, 0.3341985, 
0.3437265, 0.28315, 0.3545635, 0.3188615, 0.3351165, 0.29006, 
0.344683, 0.328796, 0.3333425, 0.2996415, 0.329305, 0.294367, 
0.3512895, 0.312173, 0.3617735, 0.333975, 0.320697, 0.317978, 
0.309712, 0.307166, 0.31381, 0.330554, 0.299173, 0.3284935, 0.332073, 
0.3212095, 0.3375295, 0.2990235, 0.323863, 0.31427, 0.366324, 
0.3212675, 0.296698, 0.3450765, 0.2904305, 0.347157, 0.3595685, 
0.3391065, 0.315691, 0.2837855, 0.3240025, 0.326163, 0.3245175, 
0.3418675, 0.2913055, 0.298335, 0.3394605, 0.344435, 0.284265, 
0.3228975, 0.3116815, 0.3157865, 0.31368, 0.3133025, 0.286124, 
0.313569, 0.3510455, 0.3255705, 0.3203715, 0.3242275, 0.343879, 
0.3436655, 0.2713715, 0.3381005, 0.3385825, 0.3190355, 0.3597505, 
0.3271475, 0.320138, 0.3115485, 0.3276145, 0.33251, 0.3437875, 
0.3158815, 0.3218285, 0.326796, 0.3061665, 0.3622365, 0.3555745, 
0.3250755, 0.321343, 0.337907, 0.3159475, 0.3190175, 0.301024, 
0.3316655, 0.3446295, 0.3135155, 0.3220135, 0.3405115, 0.3710685, 
0.2866515, 0.300246, 0.3302455, 0.3141975, 0.332256, 0.3296115, 
0.321672, 0.287862, 0.354463, 0.321344, 0.304759, 0.337911, 0.2871595, 
0.3248345, 0.3188435, 0.3068095, 0.3741625, 0.336544, 0.31421, 
0.351038, 0.3517805, 0.343052, 0.3070755, 0.3310035, 0.3682345, 
0.3137875, 0.328222, 0.3308255, 0.309383, 0.3096755, 0.299197, 
0.3684665, 0.3453295, 0.3565655, 0.3318945, 0.3180955, 0.356223, 
0.331051, 0.2844205, 0.316385, 0.347013, 0.330784, 0.326344, 
0.3518635, 0.3122595, 0.318758, 0.3130085, 0.340933, 0.337239, 
0.320081, 0.3142475, 0.34704, 0.2590365, 0.31095, 0.317086, 0.3551175, 
0.31727, 0.342804, 0.271582, 0.2907645, 0.3480505, 0.3033985, 
0.3154525, 0.3431455, 0.2930245, 0.321643, 0.3315875, 0.328915, 
0.317671, 0.2761495, 0.316245, 0.336618, 0.2994025, 0.318245, 
0.321339, 0.3258745, 0.298861, 0.3034705), Age = c(83L, 26L, 
26L, 20L, 84L, 20L, 23L, 77L, 32L, 14L, 21L, 9L, 15L, 76L, 18L, 
21L, 15L, 75L, 34L, 81L, 81L, 15L, 24L, 24L, 16L, 27L, 7L, 30L, 
31L, 24L, 24L, 31L, 79L, 30L, 19L, 78L, 25L, 20L, 42L, 62L, 83L, 
79L, 18L, 26L, 66L, 23L, 83L, 21L, 77L, 80L, 24L, 57L, 42L, 32L, 
76L, 85L, 29L, 77L, 65L, 79L, 9L, 34L, 11L, 16L, 9L, 21L, 16L, 
34L, 22L, 19L, 23L, 25L, 14L, 53L, 28L, 79L, 22L, 22L, 21L, 82L, 
81L, 16L, 19L, 77L, 15L, 18L, 15L, 78L, 24L, 16L, 14L, 29L, 18L, 
50L, 17L, 43L, 8L, 14L, 85L, 31L, 20L, 30L, 23L, 78L, 29L, 6L, 
61L, 14L, 22L, 10L, 83L, 15L, 13L, 15L, 15L, 29L, 8L, 9L, 15L, 
8L, 9L, 15L, 9L, 34L, 8L, 9L, 9L, 16L, 8L, 25L, 21L, 23L, 13L, 
56L, 10L, 7L, 27L, 8L, 8L, 8L, 8L, 80L, 80L, 6L, 15L, 42L, 25L, 
23L, 21L, 8L, 11L, 43L, 69L, 34L, 34L, 14L, 12L, 10L, 22L, 78L, 
16L, 76L, 12L, 10L, 16L, 6L, 13L, 66L, 11L, 26L, 12L, 16L, 13L, 
24L, 76L, 10L, 65L, 20L, 13L, 25L, 14L, 12L, 15L, 43L, 51L, 27L, 
15L, 24L, 34L, 63L, 17L, 15L, 9L, 12L, 17L, 82L, 75L, 24L, 44L, 
69L, 11L, 10L, 12L, 10L, 10L, 70L, 54L, 45L, 42L, 84L, 54L, 23L, 
23L, 14L, 81L, 17L, 42L, 44L, 16L, 15L, 43L, 45L, 50L, 53L, 23L, 
53L, 49L, 13L, 69L, 14L, 65L, 14L, 13L, 22L, 67L, 59L, 52L, 54L, 
44L, 78L, 62L, 69L, 10L, 63L, 57L, 22L, 12L, 62L, 9L, 82L, 53L, 
54L, 66L, 49L, 63L, 51L, 9L, 45L, 49L, 77L, 49L, 61L, 62L, 57L, 
67L, 16L, 65L, 75L, 45L, 16L, 55L, 17L, 64L, 67L, 56L, 52L, 63L, 
10L, 62L, 14L, 66L, 68L, 15L, 13L, 43L, 47L, 55L, 69L, 21L, 67L, 
34L, 52L, 15L, 31L, 64L, 55L, 44L, 13L, 48L, 71L, 64L, 13L, 25L, 
34L, 50L, 61L, 70L, 33L, 57L, 51L, 46L, 57L, 69L, 46L, 8L, 11L, 
46L, 71L, 33L, 38L, 56L, 17L, 29L, 28L, 6L)), row.names = c(NA, 
-325L), class = c("tbl_df", "tbl", "data.frame"))

To determine whether a linear or polynomial model is a better fit for my data, I used the following:

# Fit polynomial models of increasing degree to test which best fits the data
lm1 <- lm(Value ~ Age, data = DF)
lm2 <- lm(Value ~ poly(Age, 2, raw = T), data = DF)
lm3 <- lm(Value ~ poly(Age, 3, raw = T), data = DF) # best model?
lm4 <- lm(Value ~ poly(Age, 4, raw = T), data = DF)

I then tested which model was best with sequential ANOVA comparisons:

anova(lm1, lm2, test = "Chisq")
anova(lm2, lm3, test = "Chisq")
anova(lm3, lm4, test = "Chisq")

The summary of the third model, summary(lm3), yields:

Call:
lm(formula = Value ~ poly(Age, 3, raw = T), data = DF)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.09892 -0.02047  0.00648  0.02405  0.06287 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             3.413e-01  1.038e-02  32.894  < 2e-16 ***
poly(Age, 3, raw = T)1 -3.763e-03  1.006e-03  -3.740 0.000218 ***
poly(Age, 3, raw = T)2  1.126e-04  2.581e-05   4.364 1.72e-05 ***
poly(Age, 3, raw = T)3 -9.280e-07  1.915e-07  -4.846 1.96e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.03216 on 321 degrees of freedom
Multiple R-squared:  0.09686,   Adjusted R-squared:  0.08842 
F-statistic: 11.48 on 3 and 321 DF,  p-value: 3.628e-07

I can then plot the following:

ggplot(DF) +
  geom_jitter(aes(x = Age, y = Value), alpha = 0.5) +
  geom_smooth(aes(x = Age, y = Value), method = 'lm', formula = y ~ poly(x, 3))

[Plot: jittered Value-vs-Age data with the cubic polynomial fit and its confidence band]

The plot suggests that older adults have lower Values than younger adults. To test whether this is representative of the general population, I performed a bootstrap that included the cubic polynomial fit:

library(mosaic)

boot_lm = do(10000) * lm(Value ~ poly(Age, 3, raw = T), data = mosaic::resample(DF))

# see the 95% confidence interval
confint(boot_lm, level = 0.95)

This provides a 95% CI of:

                    name         lower         upper level     method      estimate
1              Intercept  3.185234e-01  3.650911e-01  0.95 percentile  3.224364e-01
2 poly.Age..3..raw...T.1 -6.027114e-03 -1.609017e-03  0.95 percentile -1.759840e-03
3 poly.Age..3..raw...T.2  5.801781e-05  1.691263e-04  0.95 percentile  5.910076e-05
4 poly.Age..3..raw...T.3 -1.333764e-06 -5.369094e-07  0.95 percentile -5.276421e-07
5                  sigma  2.954775e-02  3.425427e-02  0.95 percentile  3.044845e-02
6              r.squared  5.060619e-02  1.740023e-01  0.95 percentile  6.859062e-02
7                      F  5.703494e+00  2.254032e+01  0.95 percentile  7.879668e+00

My questions are:

  1. Did I correctly identify that a poly(Age,3) model was most appropriate?
  2. If so, was the polynomial correctly added to the bootstrap?
  3. Lastly, can I state something like:
    • "We are 95% confident that in the true population, adults show a
      non-linear trend in Value over time, with a larger decline in late life."

Best Answer

Did I correctly identify that a poly(Age,3) model was most appropriate?

I don't think so. There may be issues of multiple testing here, because you've performed several sequential likelihood ratio tests.
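If you do want a single omnibus comparison of the candidate polynomials, one alternative (a minimal sketch using your fitted objects, not something you must do) is to compare them all at once by AIC rather than chaining tests:

# Compare all candidate polynomial fits in one step; lower AIC is better
AIC(lm1, lm2, lm3, lm4)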

A better approach would be to compare a linear model against a spline model. Here is how to do this: I'll fit a model using natural splines with knots placed at the 5th, 27.5th, 50th, 72.5th, and 95th percentiles, as per Frank Harrell's suggestions in his book Regression Modeling Strategies.

library(splines)

# Place the knots at the indicated quantiles, as per Frank Harrell's suggestions

knot_locs = quantile(d$Age, c(0.05, 0.275, 0.5, 0.725, 0.95))
fit1 = lm(Value ~ Age, data = d)
fit2 = lm(Value ~ ns(Age, knots = knot_locs), data = d)

anova(fit1, fit2)
Analysis of Variance Table

Model 1: Value ~ Age
Model 2: Value ~ ns(Age, knots = knot_locs)
  Res.Df     RSS Df Sum of Sq      F   Pr(>F)    
1    323 0.36426                                 
2    318 0.31802  5  0.046232 9.2456 3.12e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Using a spline is a lower-bias way to estimate a non-linear effect while keeping the model's complexity low. Here, I've used 5 knots for the spline, but you are free to use fewer or more (see the sketch after the plot below). The F test here favors the spline model. We can plot the fit easily.

library(tidyverse)
library(modelr)

data_grid(d, Age) %>% 
  add_predictions(fit2) %>% 
  ggplot(aes(Age, pred)) +
  geom_point(data = d, aes(Age, Value)) +
  geom_line(color = 'red', size = 1)

[Plot: the data with the natural spline fit overlaid in red]
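On the knot count mentioned above: as a quick, hypothetical illustration (the df = 4 below is an arbitrary choice, not a recommendation), you can also let ns() place the knots itself from a degrees-of-freedom argument and compare the fits:

# Hypothetical alternative: let ns() choose the knot locations from df
fit2_alt = lm(Value ~ ns(Age, df = 4), data = d)
AIC(fit2, fit2_alt)  # lower AIC is better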

I think this approach, should the assumptions of the linear model be met (they look fine to me from residual diagnostics), would eliminate the need for bootstrapping.
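For reference, the standard base-R diagnostics for the spline fit can be produced as follows (a minimal sketch; not necessarily the exact diagnostics I ran):

# Residuals vs fitted, normal Q-Q, scale-location, and leverage plots
par(mfrow = c(2, 2))
plot(fit2)
par(mfrow = c(1, 1))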

Lastly, can I state something like:

"We are 95% confident that in the true population, adults show a non-linear trend in Value overtime, with a larger decline in late life."

Uh...no, that is not typically something we would say. I might say something like...

"A spline model with 5 degrees of freedom is favoured over a model with a linear effect of age (F Value 9.24 on 6 and 318 degrees of freedom, p<0.001). Our model estimates that the expected Value should decline as individuals become older than 60".

Making statements about the decline specifically is...doable, but may take some care. I think here is where I would bootstrap and perhaps look at the derivative of the models. You could say "In X bootstrap replications, Y% of models had a negative derivative after age Z" or something, but I'm not confident. In any case, you would just have to be very careful about what you say. I'm not sure it would be worth it.

Edit: I agree with Whuber that the bootstrap is not necessary and that a joint confidence region gives information about the trend, but I disagree that a confidence band is a good way to visualize the uncertainty in this particular problem. Shown below is a plot of the confidence band. Note that the model is more uncertain about the prediction in the right tail; that is not surprising. But from this plot alone, we can't tell whether that increase in uncertainty is consistent with all models from the sampling distribution having a negative derivative in the tail. For that, the bootstrap can help.

[Plot: the spline fit with its pointwise confidence band, which widens in the right tail]
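(For reference, a minimal sketch of one way to draw such a band, reusing the same knot rule inside geom_smooth; this is not necessarily how the plot above was produced:)

# Pointwise confidence band for the natural spline fit; the band
# widens in the right tail where the data are sparse
ggplot(d, aes(Age, Value)) +
  geom_point(alpha = 0.5) +
  geom_smooth(
    method = 'lm',
    formula = y ~ splines::ns(x, knots = quantile(x, c(0.05, 0.275, 0.5, 0.725, 0.95)))
  )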

Here, I have refit the model on 250 bootstrap resamples (including re-selecting the knot locations at the indicated percentiles) and plotted only the relevant part of the fit (ages 60 and older). The overwhelming majority of samples have a negative derivative, but a few don't. This plot, I argue, is more informative than the confidence band alone because we can visualize (and, if we wanted to, estimate) how many bootstrap samples have a negative first derivative in the tail. I leave that to you, should you find the approach enticing (a sketch of that estimate follows the code below).

[Plot: spline fits from 250 bootstrap resamples over ages 60+; most slope downward, a few do not]

Code


library(splines)
library(tidyverse)
library(rsample)
library(modelr)


d = # Your data here

do_fit <- function(split){
  # Fit on the bootstrap (analysis) set, re-selecting the knot
  # locations from that resample
  model_data = analysis(split)
  knot_locs = quantile(model_data$Age, c(0.05, 0.275, 0.5, 0.725, 0.95))
  model = lm(Value ~ ns(Age, knots = knot_locs), data = model_data)
  model
}

# Predict over a fine age grid in the region of interest; note that
# ages past the observed maximum are linear extrapolation of the
# natural spline beyond its boundary knots
do_pred <- function(data, model){
  data_grid(data, Age = seq(60, 100, 0.1)) %>% 
    add_predictions(model)
}

# Estimate the derivative using finite differences
do_deriv <- function(data, model){
  data_grid(data, Age = seq(60, 100, 0.1)) %>% 
    add_predictions(model) %>% 
    mutate(deriv = (pred - lag(pred)) / 0.1)
}


straps = bootstraps(d, times = 250) %>% 
  mutate(
    fits  = map(splits, do_fit),
    preds = map(fits, ~do_pred(d, .x)),
    deriv = map(fits, ~do_deriv(d, .x))
  )


straps %>% 
  unnest(preds) %>% 
  ggplot(aes(Age, pred, group=id))+
  geom_line(alpha = 0.5)
  


straps %>% 
  unnest(deriv) %>% 
  ggplot(aes(Age, deriv, group=id))+
  geom_line(alpha = 0.5)
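Finally, a sketch of the numeric summary suggested earlier (the age-70 cutoff is an arbitrary illustration, not a recommendation):

# Proportion of bootstrap fits whose estimated derivative is
# negative at every grid point past age 70
straps %>% 
  unnest(deriv) %>% 
  filter(Age > 70) %>% 
  group_by(id) %>% 
  summarise(all_negative = all(deriv < 0)) %>% 
  summarise(prop_negative = mean(all_negative))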