Regression – How to Interpret Polynomial Regression Bootstrap Output?

bootstrap, confidence-interval, nonlinear-regression, polynomial, regression

I have the following data:

structure(list(Value= c(0.228881, 0.26864, 0.2746545, 0.2588095, 
0.2212295, 0.262956, 0.239068, 0.2645695, 0.270166, 0.2464375, 
0.2488355, 0.260082, 0.228691, 0.2642105, 0.255971, 0.236502, 
0.316483, 0.2635265, 0.2293235, 0.2490335, 0.277711, 0.3510685, 
0.261011, 0.2491845, 0.2761005, 0.2556445, 0.2212895, 0.2502325, 
0.251632, 0.2535995, 0.2646355, 0.250128, 0.2628675, 0.2525865, 
0.267544, 0.243567, 0.2433465, 0.262231, 0.267147, 0.254661, 
0.2522155, 0.261094, 0.260184, 0.2574835, 0.3249415, 0.2602425, 
0.2749145, 0.235879, 0.259291, 0.2519655, 0.260457, 0.2461125, 
0.3238965, 0.2454995, 0.2547415, 0.276334, 0.26388, 0.229455, 
0.375393, 0.2816165, 0.2516335, 0.245579, 0.3038465, 0.3206155, 
0.2868495, 0.2556445, 0.263657, 0.334416, 0.317611, 0.31837, 
0.3248685, 0.3390745, 0.314509, 0.3346625, 0.3089435, 0.3258815, 
0.3290105, 0.327173, 0.3170855, 0.2896965, 0.2958815, 0.297433, 
0.333545, 0.3231185, 0.3358645, 0.361661, 0.3254935, 0.296155, 
0.3181945, 0.2869195, 0.3351975, 0.3051585, 0.314985, 0.315691, 
0.332565, 0.2660795, 0.3195265, 0.307352, 0.29095, 0.29972, 0.302007, 
0.3325525, 0.318101, 0.2955135, 0.3089395, 0.3481915, 0.2785565, 
0.3471645, 0.3290795, 0.315884, 0.311812, 0.251254, 0.3061215, 
0.269718, 0.334314, 0.3195015, 0.2758345, 0.3181295, 0.3232825, 
0.327678, 0.3539715, 0.3066505, 0.307193, 0.3124935, 0.224277, 
0.302592, 0.327028, 0.30327, 0.307637, 0.28713, 0.312529, 0.330831, 
0.2510545, 0.307193, 0.361899, 0.231769, 0.297745, 0.350601, 
0.2862895, 0.337318, 0.316977, 0.2623725, 0.300464, 0.302148, 
0.321019, 0.335075, 0.3061115, 0.3245545, 0.334073, 0.3318655, 
0.3464885, 0.348252, 0.3281445, 0.301745, 0.314267, 0.3341985, 
0.3437265, 0.28315, 0.3545635, 0.3188615, 0.3351165, 0.29006, 
0.344683, 0.328796, 0.3333425, 0.2996415, 0.329305, 0.294367, 
0.3512895, 0.312173, 0.3617735, 0.333975, 0.320697, 0.317978, 
0.309712, 0.307166, 0.31381, 0.330554, 0.299173, 0.3284935, 0.332073, 
0.3212095, 0.3375295, 0.2990235, 0.323863, 0.31427, 0.366324, 
0.3212675, 0.296698, 0.3450765, 0.2904305, 0.347157, 0.3595685, 
0.3391065, 0.315691, 0.2837855, 0.3240025, 0.326163, 0.3245175, 
0.3418675, 0.2913055, 0.298335, 0.3394605, 0.344435, 0.284265, 
0.3228975, 0.3116815, 0.3157865, 0.31368, 0.3133025, 0.286124, 
0.313569, 0.3510455, 0.3255705, 0.3203715, 0.3242275, 0.343879, 
0.3436655, 0.2713715, 0.3381005, 0.3385825, 0.3190355, 0.3597505, 
0.3271475, 0.320138, 0.3115485, 0.3276145, 0.33251, 0.3437875, 
0.3158815, 0.3218285, 0.326796, 0.3061665, 0.3622365, 0.3555745, 
0.3250755, 0.321343, 0.337907, 0.3159475, 0.3190175, 0.301024, 
0.3316655, 0.3446295, 0.3135155, 0.3220135, 0.3405115, 0.3710685, 
0.2866515, 0.300246, 0.3302455, 0.3141975, 0.332256, 0.3296115, 
0.321672, 0.287862, 0.354463, 0.321344, 0.304759, 0.337911, 0.2871595, 
0.3248345, 0.3188435, 0.3068095, 0.3741625, 0.336544, 0.31421, 
0.351038, 0.3517805, 0.343052, 0.3070755, 0.3310035, 0.3682345, 
0.3137875, 0.328222, 0.3308255, 0.309383, 0.3096755, 0.299197, 
0.3684665, 0.3453295, 0.3565655, 0.3318945, 0.3180955, 0.356223, 
0.331051, 0.2844205, 0.316385, 0.347013, 0.330784, 0.326344, 
0.3518635, 0.3122595, 0.318758, 0.3130085, 0.340933, 0.337239, 
0.320081, 0.3142475, 0.34704, 0.2590365, 0.31095, 0.317086, 0.3551175, 
0.31727, 0.342804, 0.271582, 0.2907645, 0.3480505, 0.3033985, 
0.3154525, 0.3431455, 0.2930245, 0.321643, 0.3315875, 0.328915, 
0.317671, 0.2761495, 0.316245, 0.336618, 0.2994025, 0.318245, 
0.321339, 0.3258745, 0.298861, 0.3034705), Age = c(83L, 26L, 
26L, 20L, 84L, 20L, 23L, 77L, 32L, 14L, 21L, 9L, 15L, 76L, 18L, 
21L, 15L, 75L, 34L, 81L, 81L, 15L, 24L, 24L, 16L, 27L, 7L, 30L, 
31L, 24L, 24L, 31L, 79L, 30L, 19L, 78L, 25L, 20L, 42L, 62L, 83L, 
79L, 18L, 26L, 66L, 23L, 83L, 21L, 77L, 80L, 24L, 57L, 42L, 32L, 
76L, 85L, 29L, 77L, 65L, 79L, 9L, 34L, 11L, 16L, 9L, 21L, 16L, 
34L, 22L, 19L, 23L, 25L, 14L, 53L, 28L, 79L, 22L, 22L, 21L, 82L, 
81L, 16L, 19L, 77L, 15L, 18L, 15L, 78L, 24L, 16L, 14L, 29L, 18L, 
50L, 17L, 43L, 8L, 14L, 85L, 31L, 20L, 30L, 23L, 78L, 29L, 6L, 
61L, 14L, 22L, 10L, 83L, 15L, 13L, 15L, 15L, 29L, 8L, 9L, 15L, 
8L, 9L, 15L, 9L, 34L, 8L, 9L, 9L, 16L, 8L, 25L, 21L, 23L, 13L, 
56L, 10L, 7L, 27L, 8L, 8L, 8L, 8L, 80L, 80L, 6L, 15L, 42L, 25L, 
23L, 21L, 8L, 11L, 43L, 69L, 34L, 34L, 14L, 12L, 10L, 22L, 78L, 
16L, 76L, 12L, 10L, 16L, 6L, 13L, 66L, 11L, 26L, 12L, 16L, 13L, 
24L, 76L, 10L, 65L, 20L, 13L, 25L, 14L, 12L, 15L, 43L, 51L, 27L, 
15L, 24L, 34L, 63L, 17L, 15L, 9L, 12L, 17L, 82L, 75L, 24L, 44L, 
69L, 11L, 10L, 12L, 10L, 10L, 70L, 54L, 45L, 42L, 84L, 54L, 23L, 
23L, 14L, 81L, 17L, 42L, 44L, 16L, 15L, 43L, 45L, 50L, 53L, 23L, 
53L, 49L, 13L, 69L, 14L, 65L, 14L, 13L, 22L, 67L, 59L, 52L, 54L, 
44L, 78L, 62L, 69L, 10L, 63L, 57L, 22L, 12L, 62L, 9L, 82L, 53L, 
54L, 66L, 49L, 63L, 51L, 9L, 45L, 49L, 77L, 49L, 61L, 62L, 57L, 
67L, 16L, 65L, 75L, 45L, 16L, 55L, 17L, 64L, 67L, 56L, 52L, 63L, 
10L, 62L, 14L, 66L, 68L, 15L, 13L, 43L, 47L, 55L, 69L, 21L, 67L, 
34L, 52L, 15L, 31L, 64L, 55L, 44L, 13L, 48L, 71L, 64L, 13L, 25L, 
34L, 50L, 61L, 70L, 33L, 57L, 51L, 46L, 57L, 69L, 46L, 8L, 11L, 
46L, 71L, 33L, 38L, 56L, 17L, 29L, 28L, 6L)), row.names = c(NA, 
-325L), class = c("tbl_df", "tbl", "data.frame"))

To determine whether a linear or polynomial model is a better fit for my data, I used the following:

# Fit polynomial models of increasing degree to test which best fits the data
lm1 <- lm(Value ~ Age, data = DF)
lm2 <- lm(Value ~ poly(Age, 2, raw = T), data = DF)
lm3 <- lm(Value ~ poly(Age, 3, raw = T), data = DF) # best model?
lm4 <- lm(Value ~ poly(Age, 4, raw = T), data = DF)

I then tested which model was best with sequential ANOVA comparisons:

anova(lm1, lm2, test = "Chisq")
anova(lm2, lm3, test = "Chisq")
anova(lm3, lm4, test = "Chisq")

The summary of the third model, summary(lm3), yields:

Call:
lm(formula = Value ~ poly(Age, 3, raw = T), data = DF)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.09892 -0.02047  0.00648  0.02405  0.06287 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             3.413e-01  1.038e-02  32.894  < 2e-16 ***
poly(Age, 3, raw = T)1 -3.763e-03  1.006e-03  -3.740 0.000218 ***
poly(Age, 3, raw = T)2  1.126e-04  2.581e-05   4.364 1.72e-05 ***
poly(Age, 3, raw = T)3 -9.280e-07  1.915e-07  -4.846 1.96e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.03216 on 321 degrees of freedom
Multiple R-squared:  0.09686,   Adjusted R-squared:  0.08842 
F-statistic: 11.48 on 3 and 321 DF,  p-value: 3.628e-07

I can then plot the following:

ggplot(DF) +
  geom_jitter(aes(x = Age, y = Value), alpha = 0.5) +
  geom_smooth(aes(x = Age, y = Value), method = 'lm', formula = y ~ poly(x, 3))

[Plot: jittered Value-vs-Age data with the cubic polynomial fit and its confidence band]

The plot suggests that older adults have lower Values than younger adults. To test whether this is representative of the general population, I performed a bootstrap that included the cubic polynomial fit:

library(mosaic)

boot_lm = do(10000) * lm(Value ~ poly(Age, 3, raw = T), data = mosaic::resample(DF))

# see the 95% confidence interval
confint(boot_lm, level = 0.95)

This provides a 95% CI of:

                    name         lower         upper level     method      estimate
1              Intercept  3.185234e-01  3.650911e-01  0.95 percentile  3.224364e-01
2 poly.Age..3..raw...T.1 -6.027114e-03 -1.609017e-03  0.95 percentile -1.759840e-03
3 poly.Age..3..raw...T.2  5.801781e-05  1.691263e-04  0.95 percentile  5.910076e-05
4 poly.Age..3..raw...T.3 -1.333764e-06 -5.369094e-07  0.95 percentile -5.276421e-07
5                  sigma  2.954775e-02  3.425427e-02  0.95 percentile  3.044845e-02
6              r.squared  5.060619e-02  1.740023e-01  0.95 percentile  6.859062e-02
7                      F  5.703494e+00  2.254032e+01  0.95 percentile  7.879668e+00

My questions are:

  1. Did I correctly identify that a poly(Age,3) model was most appropriate?
  2. If so, was the polynomial correctly added to the bootstrap?
  3. Lastly, can I state something like:
    • "We are 95% confident that in the true population, adults show a
      non-linear trend in Value over time, with a larger decline in late life."

Best Answer

Did I correctly identify that a poly(Age,3) model was most appropriate?

I don't think so. There may be issues of multiple testing here, because you've performed several sequential likelihood ratio tests.
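If you do want a single omnibus comparison of the candidate polynomials, one alternative (a minimal sketch using your fitted objects, not something you must do) is to compare them all at once by AIC rather than chaining tests:

# Compare all candidate polynomial fits in one step; lower AIC is better
AIC(lm1, lm2, lm3, lm4)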

A better approach would be to compare a linear model against a spline model. Here is how to do this: I'll fit a model using natural splines with knots placed at the 5th, 27.5th, 50th, 72.5th, and 95th percentiles, as per Frank Harrell's suggestions in his book Regression Modeling Strategies.

library(splines)

# Place the knots at the indicated quantiles, as per Frank Harrell's suggestions

knot_locs = quantile(d$Age, c(0.05, 0.275, 0.5, 0.725, 0.95))
fit1 = lm(Value ~ Age, data = d)
fit2 = lm(Value ~ ns(Age, knots = knot_locs), data = d)

anova(fit1, fit2)
Analysis of Variance Table

Model 1: Value ~ Age
Model 2: Value ~ ns(Age, knots = knot_locs)
  Res.Df     RSS Df Sum of Sq      F   Pr(>F)    
1    323 0.36426                                 
2    318 0.31802  5  0.046232 9.2456 3.12e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Using a spline is a lower-bias way to estimate a non-linear effect while keeping the model's complexity low. Here, I've used 5 knots for the spline, but you are free to use fewer or more (see the sketch after the plot below). The F test here favors the spline model. We can plot the fit easily.

library(tidyverse)
library(modelr)

data_grid(d, Age) %>% 
  add_predictions(fit2) %>% 
  ggplot(aes(Age, pred)) +
  geom_point(data = d, aes(Age, Value)) +
  geom_line(color = 'red', size = 1)

[Plot: the data with the natural spline fit overlaid in red]
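On the knot count mentioned above: as a quick, hypothetical illustration (the df = 4 below is an arbitrary choice, not a recommendation), you can also let ns() place the knots itself from a degrees-of-freedom argument and compare the fits:

# Hypothetical alternative: let ns() choose the knot locations from df
fit2_alt = lm(Value ~ ns(Age, df = 4), data = d)
AIC(fit2, fit2_alt)  # lower AIC is better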

I think this approach, should the assumptions of the linear model be met (they look fine to me from residual diagnostics), would eliminate the need for bootstrapping.
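For reference, the standard base-R diagnostics for the spline fit can be produced as follows (a minimal sketch; not necessarily the exact diagnostics I ran):

# Residuals vs fitted, normal Q-Q, scale-location, and leverage plots
par(mfrow = c(2, 2))
plot(fit2)
par(mfrow = c(1, 1))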

Lastly, can I state something like:

"We are 95% confident that in the true population, adults show a non-linear trend in Value overtime, with a larger decline in late life."

Uh...no, that is not typically something we would say. I might say something like...

"A spline model with 5 degrees of freedom is favoured over a model with a linear effect of age (F Value 9.24 on 6 and 318 degrees of freedom, p<0.001). Our model estimates that the expected Value should decline as individuals become older than 60".

Making statements about the decline specifically is...doable, but may take some care. I think here is where I would bootstrap and perhaps look at the derivative of the models. You could say "In X bootstrap replications, Y% of models had a negative derivative after age Z" or something, but I'm not confident. In any case, you would just have to be very careful about what you say. I'm not sure it would be worth it.

Edit: I agree with Whuber that the bootstrap is not necessary and that a joint confidence region gives information about the trend, but I disagree that a confidence band is a good way to visualize the uncertainty in this particular problem. Shown below is a plot of the confidence band. Note that the model is more uncertain about the prediction in the right tail; that is not surprising. But from this plot alone, we can't tell whether that increase in uncertainty is consistent with all models from the sampling distribution having a negative derivative in the tail. For that, the bootstrap can help.

[Plot: the spline fit with its pointwise confidence band, which widens in the right tail]
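(For reference, a minimal sketch of one way to draw such a band, reusing the same knot rule inside geom_smooth; this is not necessarily how the plot above was produced:)

# Pointwise confidence band for the natural spline fit; the band
# widens in the right tail where the data are sparse
ggplot(d, aes(Age, Value)) +
  geom_point(alpha = 0.5) +
  geom_smooth(
    method = 'lm',
    formula = y ~ splines::ns(x, knots = quantile(x, c(0.05, 0.275, 0.5, 0.725, 0.95)))
  )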

Here, I have refit the model on 250 bootstrap resamples (including re-selecting the knot locations at the indicated percentiles) and plotted only the relevant part of the fit (ages 60 and older). The overwhelming majority of samples have a negative derivative, but a few don't. This plot, I argue, is more informative than the confidence band alone because we can visualize (and, if we wanted to, estimate) how many bootstrap samples have a negative first derivative in the tail. I leave that to you, should you find the approach enticing (a sketch of that estimate follows the code below).

[Plot: spline fits from 250 bootstrap resamples over ages 60+; most slope downward, a few do not]

Code


library(splines)
library(tidyverse)
library(rsample)
library(modelr)


d = # Your data here

do_fit <- function(split){
  # Fit on the bootstrap (analysis) set, re-selecting the knot
  # locations from that resample
  model_data = analysis(split)
  knot_locs = quantile(model_data$Age, c(0.05, 0.275, 0.5, 0.725, 0.95))
  model = lm(Value ~ ns(Age, knots = knot_locs), data = model_data)
  model
}

# Predict over a fine age grid in the region of interest; note that
# ages past the observed maximum are linear extrapolation of the
# natural spline beyond its boundary knots
do_pred <- function(data, model){
  data_grid(data, Age = seq(60, 100, 0.1)) %>% 
    add_predictions(model)
}

# Estimate the derivative using finite differences
do_deriv <- function(data, model){
  data_grid(data, Age = seq(60, 100, 0.1)) %>% 
    add_predictions(model) %>% 
    mutate(deriv = (pred - lag(pred)) / 0.1)
}


straps = bootstraps(d, times = 250) %>% 
  mutate(
    fits  = map(splits, do_fit),
    preds = map(fits, ~do_pred(d, .x)),
    deriv = map(fits, ~do_deriv(d, .x))
  )


straps %>% 
  unnest(preds) %>% 
  ggplot(aes(Age, pred, group=id))+
  geom_line(alpha = 0.5)
  


straps %>% 
  unnest(deriv) %>% 
  ggplot(aes(Age, deriv, group=id))+
  geom_line(alpha = 0.5)
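Finally, a sketch of the numeric summary suggested earlier (the age-70 cutoff is an arbitrary illustration, not a recommendation):

# Proportion of bootstrap fits whose estimated derivative is
# negative at every grid point past age 70
straps %>% 
  unnest(deriv) %>% 
  filter(Age > 70) %>% 
  group_by(id) %>% 
  summarise(all_negative = all(deriv < 0)) %>% 
  summarise(prop_negative = mean(all_negative))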