Regression Splines – Differences Between Natural Cubic Splines Implementation in R

logisticnonlinear regressionrregressionsplines

I compare the rcs() function of the rms package with the ns() function of the spline package with the following code. The final plot reveals that the for the rcs() function the 95% CI is zero at the reference value of age = 45 and that the 95% CI is much wider for the rcs() function. I did not expect such huge differences between both functions. Why are the results that different? Clinical interpretation might considerably differ depending on the chosen function.

Example #1 (Start)

##############################
# Load packages
##############################
library("Hmisc")
library("rms")
library("splines")
library("ggplot2")

###################################
# Test data
# https://www.rdocumentation.org/packages/rms/versions/6.3-0/topics/lrm
###################################
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
label(age)            <- 'Age'      # label is in Hmisc
label(cholesterol)    <- 'Total Cholesterol'
label(blood.pressure) <- 'Systolic Blood Pressure'
label(sex)            <- 'Sex'
units(cholesterol)    <- 'mg/dl'   # uses units.default in Hmisc
units(blood.pressure) <- 'mmHg'

# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male'))

# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
outcome <- ifelse(runif(n) < plogis(L), 1, 0)
cholesterol[1:3] <- NA   # 3 missings, at random

d <- data.frame(outcome=outcome, age=age, sex=sex, blood.pressure=blood.pressure, cholesterol=cholesterol)

###################################
# Analysis using lrm from rms and rcs
###################################

dd <- datadist(d); options(datadist='dd')

dd$limits["Adjust to","age"] <- 45

fit <- lrm(outcome ~ rcs(age, 4))
anova(fit) # age is not nonlinear
p2 <- Predict(fit, age, ref.zero=TRUE, fun=exp)
p2

x1 <- p2
names(x1) <- c("age", "OR", "lower", "upper")
x1$model <- "rcs"
head(x1)
describe(x1$age)

x1 <- data.frame(age=x1$age, OR=x1$OR, lower=x1$lower, upper=x1$upper, model="rms")
head(x1)

ggplot(p2) + ylab("Predicted odds ratio")   # or plot()

###################################
# Analysis using glm and ns
###################################
fit <- glm("outcome ~ ns(age,4)", family=binomial(link="logit"), data=d)

ggplot(data = d, aes(x = age, y = outcome)) +
  geom_point() +
  geom_smooth(method = "loess")


age_range <- data.frame(age = seq(from = 30, to = 74, by = 1))

ref <- predict(fit, newdata = data.frame(age=c(45)), se.fit=TRUE, type = "response", level = 0.95)$fit # set age=45 as reference value similar to "dd$limits["Adjust to","age"] <- 45"

predictions <- predict(fit, newdata = age_range, se.fit=TRUE, type = "response", level = 0.95)
x2 <- data.frame(age = age_range$age,
                      odds_ratio = exp(predictions$fit-ref),
                      lwr = exp(predictions$fit - 1.96 * predictions$se.fit-ref),
                      upr = exp(predictions$fit + 1.96 * predictions$se.fit-ref))
ggplot(data = x2, aes(x = age, y = odds_ratio)) +
  geom_line() +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2) +
  xlab("Age") +
  ylab("Predicted Odds Ratio")

names(x2)
names(x2) <- c("age", "OR", "lower", "upper")
x2$model <- "ns"
head(x2)
describe(x2$age)

x <- rbind(x1, x2)
head(x)
table(x$model)
ggplot(data=x, aes(x=age, y=OR, color=model)) +
  geom_line() +
  geom_ribbon(aes(ymin=lower, ymax=upper, color=model), alpha = 0.2) +
  xlab("Age") +
  ylab("Predicted Odds Ratio")

example #1 on odds ratio scale

resulting plot

Example #2

After adapting my starting example (#1) according to the some suggestions made by COOLSerdash [THANKS!!!] (i.e.
defining the knot positions and boundary knot positions for the ns() function
using the plogis function to plot the outcome on the probability scale
), I get the following plot.

Keeping 45 years as reference values lead to a cinched-in waist of the 95% CI for the lrm()/rcs()-approach but not for the glm()/ns()-approach. Why? What is the rationale for the cinched-in waist of the 95% CI in the lrm()/rcs()-approach?

When I keep also the odds ratio as scale, the plot looks quite similar as the one for the probability scale. The only difference in the figure of the 95% CI.

So the main discrepancy between rcs() and ns() in my first example is driven by the knot positions. This suggests that the knot positions may be crucial for the resulting clinical interpretation.

##############################
# Load packages
##############################
library("Hmisc")
library("rms")
library("splines")
library("ggplot2")

###################################
# Test data
# https://www.rdocumentation.org/packages/rms/versions/6.3-0/topics/lrm
###################################
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
label(age)            <- 'Age'      # label is in Hmisc
label(cholesterol)    <- 'Total Cholesterol'
label(blood.pressure) <- 'Systolic Blood Pressure'
label(sex)            <- 'Sex'
units(cholesterol)    <- 'mg/dl'   # uses units.default in Hmisc
units(blood.pressure) <- 'mmHg'

# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male'))

# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
outcome <- ifelse(runif(n) < plogis(L), 1, 0)
cholesterol[1:3] <- NA   # 3 missings, at random

d <- data.frame(outcome=outcome, age=age, sex=sex, blood.pressure=blood.pressure, cholesterol=cholesterol)

###################################
# Analysis using lrm from rms and rcs
###################################

dd <- datadist(d); options(datadist='dd')

dd$limits["Adjust to","age"] <- 45

fit <- lrm(outcome ~ rcs(age, 4), x=TRUE, y=TRUE)
specs(fit)
anova(fit) # age is not nonlinear
p2 <- Predict(fit, age, ref.zero=TRUE, fun=plogis)
p2

x1 <- p2
names(x1) <- c("age", "OR", "lower", "upper")
x1$model <- "rcs"

x1 <- data.frame(age=x1$age, OR=x1$OR, lower=x1$lower, upper=x1$upper, model="rms")

ggplot(p2) + ylab("Predicted odds ratio")   # or plot()

###################################
# Analysis using glm and ns
###################################
# fit <- glm("outcome ~ ns(age,4)", family=binomial(link="logit"), data=d)
fit <- glm("outcome ~ ns(age, knots = c(46.387, 53.87), Boundary.knots = c(33.899, 67.177))", family=binomial(link="logit"), data=d)


age_range <- data.frame(age = seq(from = 27.02467, to = 74.77199, length = 100))

ref <- predict(fit, newdata = data.frame(age=c(45)), se.fit=TRUE, type = "link", level = 0.95)$fit # set age=45 as reference value similar to "dd$limits["Adjust to","age"] <- 45"

predictions <- predict(fit, newdata = age_range, se.fit=TRUE, type = "link", level = 0.95)
x2 <- data.frame(age = age_range$age,
                 odds_ratio = plogis(predictions$fit-ref),
                 lwr = plogis(predictions$fit - 1.96 * predictions$se.fit-ref),
                 upr = plogis(predictions$fit + 1.96 * predictions$se.fit-ref))
ggplot(data = x2, aes(x = age, y = odds_ratio)) +
  geom_line() +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2) +
  xlab("Age") +
  ylab("Predicted Odds Ratio")

names(x2) <- c("age", "OR", "lower", "upper")
x2$model <- "ns"

x <- rbind(x1, x2)
head(x)
table(x$model)
ggplot(data=x, aes(x=age, y=OR, color=model)) +
  geom_line() +
  geom_ribbon(aes(ymin=lower, ymax=upper, color=model), alpha = 0.2) +
  xlab("Age") +
  ylab("Predicted Odds Ratio")

example #2 on probability scale

example #2 on probability scale

example #2 on odds ratio scale

example #2 on odds ratio scale

Example #3

@COOLSerdash: Thanks again for pointing me to this mistake. Would it be correct to say: If I divide the odds of age = 45 by the odds of age = 50, then I get the odds ratio for this age difference?
The odds for age=45 are 1.
As shown below, the odds for age=45 are approximately 1.238503. This is almost what summary.rms returns:

> fit <- lrm(outcome ~ rcs(age, 4), x=TRUE, y=TRUE)
> summary(fit, age=c(45,50))
             Effects              Response : outcome 

 Factor      Low High Diff. Effect  S.E.     Lower 0.95 Upper 0.95
 age         45  50   5     0.21177 0.076246 0.062329   0.36121   
  Odds Ratio 45  50   5     1.23590       NA 1.064300   1.43510

And you are also correct that fit <- glm("outcome ~ ns(age, 3)", family=binomial(link="logit"), data=d) results in almost the same predictions as fit <- lrm(outcome ~ rcs(age, 4), x=TRUE, y=TRUE) as the following plot is shows.

@EdM: Thanks for explaining the issue with the cinched-in waist of the lrm/rcs-approach. I have gotten a rough idea, even though I don't fully understand it. Which kind of 95% CI do you recommand and why ("pointwise CI as a function of age" versus "relative to 45 years of age")?

I think my initial question "I did not expect such huge differences between both functions. Why are the results that different?" is answered by "I used the wrong type in
predict {stats}. I should have used type="link" and not type="response" to plot the predicted odds. However, thanks to your great help, I have stumbled upon many other points and have learned a lot!

enter image description here

Best Answer

There are differences in the number of knots and the default knot placement. Inspect where rcs places its four knots using specs(fit):

lrm(formula = outcome ~ rcs(age, 4), x = TRUE, y = TRUE)

    Label Assumption Parameters                  d.f.
age Age   rcspline    33.899 46.387 53.87 67.177 3  

ns on the other hand places the five total knots (the number of knots is $\text{df} + 1$, see here for the relationship between the degrees of freedom and the number of knots) at

attributes(ns(d$age, 4))[c("knots", "Boundary.knots")]

$knots
     25%      50%      75% 
43.53104 50.25048 56.67729 

$Boundary.knots
[1] 14.43864 89.39746

So ns places the internal knots at the 25th, 50th and 75th percentile whereas the boundary knots (the outermost knots) are placed at the minimum and maximum.

To get the same model using ns, you have to manually specify the knots, including the boundary knots: ns(age, knots = c(46.387, 53.87), Boundary.knots = c(33.899, 67.177)).

If you then plot the predictions of both models on the probability scale, they are identical (using your simulated data as example):

PredictedProbs

R code:

##############################
# Load packages
##############################
library("Hmisc")
library("rms")
library("splines")
library("ggplot2")

###################################
# Test data
# https://www.rdocumentation.org/packages/rms/versions/6.3-0/topics/lrm
###################################
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
label(age)            <- 'Age'      # label is in Hmisc
label(cholesterol)    <- 'Total Cholesterol'
label(blood.pressure) <- 'Systolic Blood Pressure'
label(sex)            <- 'Sex'
units(cholesterol)    <- 'mg/dl'   # uses units.default in Hmisc
units(blood.pressure) <- 'mmHg'

# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male'))

# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
outcome <- ifelse(runif(n) < plogis(L), 1, 0)
cholesterol[1:3] <- NA   # 3 missings, at random

d <- data.frame(outcome=outcome, age=age, sex=sex, blood.pressure=blood.pressure, cholesterol=cholesterol)

###################################
# Analysis using lrm from rms and rcs
###################################

dd <- datadist(d); options(datadist='dd')

fit <- lrm(outcome ~ rcs(age, 4), y = TRUE, x = TRUE)

p2 <- Predict(fit, age, fun = plogis)

x1 <- p2
names(x1) <- c("age", "prob", "lower", "upper")

x1 <- data.frame(age=x1$age, prob=x1$prob, lower=x1$lower, upper=x1$upper, model="rms")

###################################
# Analysis using glm and ns
###################################

fit2 <- glm(outcome ~ ns(age, knots = c(46.387, 53.87), Boundary.knots = c(33.899, 67.177)), family=binomial(link="logit"), data=d)

age_range <- data.frame(age = seq(from = 27.02467, to = 74.77199, length = 100))

predictions <- predict(fit2, newdata = age_range, se.fit=TRUE, type = "link", level = 0.95)
x2 <- data.frame(age = age_range$age,
                 prob = plogis(predictions$fit),
                 lower = plogis(predictions$fit - 1.96 * predictions$se.fit),
                 upper = plogis(predictions$fit + 1.96 * predictions$se.fit), 
model = "ns")
    
x <- rbind(x1, x2)

ggplot(data=x, aes(x=age, y=prob, color=model)) +
  geom_line() +
  geom_ribbon(aes(ymin=lower, ymax=upper, color=model), alpha = 0.2) +
  xlab("Age") +
  ylab("Probability")