Cox Model – How to Analyze Cox Survival Data with Time-Varying Continuous Variables and Fixed Covariates in R

cox-modelrregressionsurvivaltime-varying-covariate

I am interested in the paper entitled "Generating survival times to simulate Cox proportional hazards models with time-varying covariates " (Stat Med. 2012 Dec 20;31(29):3946-58.),and want to simulate such kind of survival data. The R code is shown below. However, the beta(t) for time-varying covariate is not as defined in the beginning. I have struggled for a longtime but cannot figure out what's wrong with my code. Can anyone help me to figure it out?

N=1000 #number of subjects
k=0.3 #assuming the time varying covariates is proporitonal to time x2=k*t
lambda=0.01
betat=0.3 
beta=-0.6
rateC=0.01
#time fixed variable
x1 <- sample(x=c(0, 1), size=N, replace=TRUE, prob=c(0.5, 0.5))
  #Exponential latent event times
  u <- runif(n=N)
  Tlat <- log(1+betat*k*(-log(u))/(lambda*exp(beta*x1)))/betat*k
  Cen<- rexp(n=N, rate=rateC)#censoring times
  # follow-up times and event indicators
  time <- pmin(Tlat, Cen)
  status <- as.numeric(Tlat <= Cen)
  # data set
  df.tfixed<-data.frame(id=1:N,
         time=time,
         status=status,
         x1=x1)
##time dependent continuous variable
ntp<-sample(1:6,N,replace=T)#number of follow up time points
mat<-matrix(ncol=3,nrow=1)
i=0
for(n in ntp){
i=i+1
ft<-runif(n,min=0,max=df.tfixed$time[i])
ft<-sort(ft)
seq<-rep(ft,each=2)
seq<-c(0,seq,df.tfixed$time[i])
matid<-cbind(matrix(seq,ncol=2,nrow=n+1,byrow=T),i)
mat<-rbind(mat,matid)
}
df.td<-data.frame(mat[-1,])
colnames(df.td)<-c("start","stop","id")
df.td$x2<-k*df.td$start
#combine the two data frames 
df<-merge(df.td,df.tfixed,by="id")
df$status=0
df$status[cumsum(as.vector(table(df$id)))]<-1
fit <- coxph(Surv(start,stop, status) ~ x1+x2, data=df)
fit$coef
    x1         x2 
-0.6195906  1.3083348 

The output shows that the beta for x1 is consistent with what I have defined, but the coefficient for x2 is not what I have defined (betat=0.3).

Best Answer

I figure you are looking for the model in equation 4 in this article. Then this code will do

> library(survival)
> set.seed(747)
> 
> N <- 1e4 # number of subjects
> k <- 0.3 # assuming the time varying covariates is proporitonal to time x2=k*t
> lambda <- 0.01
> betat <- 0.3 
> beta <- -0.6
> rateC <- 0.01
> 
> #####
> # simulate 
> x1 <- as.numeric(runif(N) > .5) # covariate
> 
> # simulate outcome (NB: OP forgot parentheses)
> u <- runif(n = N)
> Tlat <- log(1 + betat*k*(-log(u))/(lambda*exp(beta*x1))) / (betat * k)
> 
> # censoring and status indicator
> Cen<- rexp(n = N, rate = rateC)
> 
> # make data.frame
> df <- data.frame(
+   time = pmin(Tlat, Cen), 
+   status = Tlat <= Cen, 
+   id = 1:N, 
+   x1 = x1, 
+   x2 = rep(k, N))
> 
> # fit model -- cannot use coxph for x2 due to the non-parametric intercept
> fit <- coxph(Surv(time, status) ~ x1, data = df)
> fit # coefficient for x1 is right
Call:
coxph(formula = Surv(time, status) ~ x1, data = df)

      coef exp(coef) se(coef)   z      p
x1 -0.6083    0.5442   0.0234 -26 <2e-16

Likelihood ratio test=675  on 1 df, p=0
n= 10000, number of events= 7891 
> 
> # # won't work as we try to estimate a term that is equal for all individuals 
> # # at each time t
> # fit <- coxph(Surv(time, status) ~ x1 + tt(x2), data = df, 
> #              tt = function(x, t, ...) x * t)
> 
> # The non-parametric cumulative hazard though match with what we expect
> base <- basehaz(fit, centered = FALSE)
> plot(base$time, base$hazard, type = "l")
> lines(
+   base$time, 
+   lambda / (betat * k) * (exp(betat * k * base$time) - 1), col = "red")

enter image description here

> #####
> # estimate parametric model
> neg_loglike_func <- function(b){
+   # compute intermediates
+   lp1 <- exp(b[1] + b[2] * df$x1)    # time-invariant terms
+   lp2 <- exp(b[3] * df$x2 * df$time) # time-varying terms
+   log_haz <- log(lp1) + log(lp2)                  # instant hazard
+   cumhaz <- lp1 * (lp2 - 1)/(b[3] * df$x2 + 1e-8) # cumhaz
+   
+   # compute log likelihood terms
+   ll_terms <- ifelse(df$status == 1,  log_haz - cumhaz, -cumhaz)
+   -sum(ll_terms) # neg log likelihood
+ }
> 
> # fit model
> b <- c(0.01, -0.02, 0.01) # starting valus
> fit <- optim(b, neg_loglike_func)
> 
> exp(fit$par[1]) # lambda estimate
[1] 0.01035017
> fit$par[2:3]    # beta and betat estimate
[1] -0.6099467  0.2966224

You may be able to avoid the optim function above by e.g., using one of the parametric survival models in the rstpm2 package or polspline.

Before edits

I cannot see how what you do is related to the cited article. Further, I am struggling to see how x2 enters into the model. Maybe post equations for the model you are trying to simulate from?

What may help you is that all of your observation end up dying at the end it seems which seems odd given that you have # censoring times in the Cen object. I show this below

library(survival)
set.seed(747)

N=1000 # number of subjects
k=0.3 # assuming the time varying covariates is proporitonal to time x2=k*t
lambda=0.01
betat=0.3 
beta=-0.6
rateC=0.01


#####
# Time fixed variable
x1 <- sample(x=c(0, 1), size=N, replace=TRUE, prob=c(0.5, 0.5))

#Exponential latent event times
u <- runif(n=N)
Tlat <- log(1+betat*k*(-log(u)) / (lambda*exp(beta*x1))) / betat*k
Cen<- rexp(n=N, rate=rateC) #censoring times

# follow-up times and event indicators
time <- pmin(Tlat, Cen)
status <- as.numeric(Tlat <= Cen)
# data set
df.tfixed<-data.frame(id=1:N, time=time, status=status, x1=x1)


#####
# time dependent continuous variable
ntp<-sample(1:6,N,replace=T) #number of follow up time points
mat<-matrix(ncol=3,nrow=1)
i=0
for(n in ntp){
  i=i+1
  ft<-runif(n,min=0,max=df.tfixed$time[i])
  ft<-sort(ft)
  seq<-rep(ft,each=2)
  seq<-c(0,seq,df.tfixed$time[i])
  matid<-cbind(matrix(seq,ncol=2,nrow=n+1,byrow=T),i)
  mat<-rbind(mat,matid)
}
df.td<-data.frame(mat[-1,])
colnames(df.td)<-c("start","stop","id")
df.td$x2<-k*df.td$start


#combine the two data frames 
df<-merge(df.td,df.tfixed,by="id")
df$status=0
df$status[cumsum(as.vector(table(df$id)))]<-1

fit <- coxph(Surv(start,stop, status) ~ x1+x2, data=df)
fit$coef
>         x1         x2 
> -0.6040089  1.8471387 

# All observations dies exaclty once -- no censoring
all(tapply(df$status == 1, df$id, sum) == 1)
> [1] TRUE

It comes down to this line df$status[cumsum(as.vector(table(df$id)))]<-1.

Related Question