# Survival analysis: Cox proportional hazards model

## Install required packages

survival

wants <- c("survival")
has   <- wants %in% rownames(installed.packages())
if(any(!has)) install.packages(wants[!has])

## Simulated right-censored event times with Weibull distribution

Time independent covariates with proportional hazards

$$T = (-\ln(U) \, b \, e^{-\bf{X} \bf{\beta}})^{\frac{1}{a}}$$, where $$U \sim \mathcal{U}(0, 1)$$

set.seed(123)
N      <- 180
P      <- 3
sex    <- factor(sample(c("f", "m"), N, replace=TRUE))
X      <- rnorm(N, 0, 1)
IV     <- factor(rep(LETTERS[1:P], each=N/P))
IVeff  <- c(0, -1, 1.5)
Xbeta  <- 0.7*X + IVeff[unclass(IV)] + rnorm(N, 0, 2)
weibA  <- 1.5
weibB  <- 100
U      <- runif(N, 0, 1)
eventT <- ceiling((-log(U)*weibB*exp(-Xbeta))^(1/weibA))
obsLen <- 120
censT  <- rep(obsLen, N)
obsT   <- pmin(eventT, censT)
status <- eventT <= censT
dfSurv <- data.frame(obsT, status, sex, X, IV)

Survival data in counting process (start-stop) notation.

library(survival)
dfSurvCP <- survSplit(dfSurv, cut=seq(30, 90, by=30), end="obsT",
event="status", start="start", id="ID", zero=0)

## Fit the Cox proportional hazards model

Assumptions

$\begin{equation*} \begin{array}{rclcl} \ln \lambda(t) &=& \ln \lambda_{0}(t) + \beta_{1} X_{1} + \dots + \beta_{p} X_{p} &=& \ln \lambda_{0}(t) + \bf{X} \bf{\beta}\\ \lambda(t) &=& \lambda_{0}(t) \, e^{\beta_{1} X_{1} + \dots + \beta_{p} X_{p}} &=& \lambda_{0}(t) \, e^{\bf{X} \bf{\beta}}\\ S(t) &=& S_{0}(t)^{\exp(\bf{X} \bf{\beta})} &=& \exp\left(-\Lambda_{0}(t) \, e^{\bf{X} \bf{\beta}}\right)\\ \Lambda(t) &=& \Lambda_{0}(t) \, e^{\bf{X} \bf{\beta}} & & \end{array} \end{equation*}$

library(survival)
(fitCPH <- coxph(Surv(obsT, status) ~ X + IV, data=dfSurv))
Call:
coxph(formula = Surv(obsT, status) ~ X + IV, data = dfSurv)

coef exp(coef) se(coef)      z        p
X    0.49304   1.63728  0.09374  5.260 1.44e-07
IVB -0.82245   0.43935  0.21081 -3.901 9.56e-05
IVC  0.37670   1.45746  0.19336  1.948   0.0514

Likelihood ratio test=51.62  on 3 df, p=3.617e-11
n= 180, number of events= 157 

Use counting process data format.

coxph(Surv(start, obsT, status) ~ X + IV, data=dfSurvCP)
summary(fitCPH)
# not shown

## Assess model fit

### AIC

library(survival)
extractAIC(fitCPH)
[1]    3.000 1367.667

### Model comparisons

Restricted model without factor IV

library(survival)
fitCPH1 <- coxph(Surv(obsT, status) ~ X, data=dfSurv)
anova(fitCPH1, fitCPH)          # model comparison
Analysis of Deviance Table
Cox model: response is  Surv(obsT, status)
Model 1: ~ X
Model 2: ~ X + IV
loglik  Chisq Df P(>|Chi|)
1 -698.77
2 -680.83 35.867  2 1.628e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Estimate survival-function und cumulative baseline hazard

### Survival function

For average pseudo-observation with covariate values equal to the sample means. For factors: means of dummy-coded indicator variables.

library(survival)                # for survfit()
(CPH <- survfit(fitCPH))
Call: survfit(formula = fitCPH)

n  events  median 0.95LCL 0.95UCL
180     157      14      11      19 
quantile(CPH, probs=c(0.25, 0.5, 0.75), conf.int=FALSE)
25 50 75
4 14 39 

More meaningful: Estimated survival function for new specific data

dfNew  <- data.frame(sex=factor(c("f", "f"), levels=levels(dfSurv$sex)), X=c(-2, -2), IV=factor(c("A", "C"), levels=levels(dfSurv$IV)))

library(survival)
CPHnew <- survfit(fitCPH, newdata=dfNew)

par(mar=c(5, 4.5, 4, 2)+0.1, cex.lab=1.4, cex.main=1.4)
plot(CPH, main=expression(paste("Cox PH-estimate ", hat(S)(t), " with CI")),
xlab="t", ylab="Survival", lwd=2)
lines(CPHnew$time, CPHnew$surv[ , 1], lwd=2, col="blue")
lines(CPHnew$time, CPHnew$surv[ , 2], lwd=2, col="red")
legend(x="topright", lwd=2, col=c("black", "blue", "red"),
legend=c("pseudo-observation", "sex=f, X=-2, IV=A", "sex=f, X=-2, IV=C"))

### Cumulative hazard

library(survival)
with(CPHnew, head(data.frame(time, cum_haz=-log(surv)), n=4))
  time  cum_haz.1  cum_haz.2
1    1 0.02700124 0.03935325
2    2 0.05287319 0.07706061
3    3 0.09865382 0.14378411
4    4 0.12645421 0.18430209
plot(CPHnew, fun="cumhaz", col=c("blue", "red"),
main=expression(paste("Cox PH-Schätzung ",
hat(Lambda)[g](t))),
ylab="cumulative hazard", lwd=2)
legend(x="bottomright", lwd=2, col=c("blue", "red"),
legend=c("sex=f, X=-2, IV=A", "sex=f, X=-2, IV=C"))

## Model diagnostics

### Proportional hazards assumption

Complementary log-log plot for categorized predictor X.

library(survival)                # for survfit()
dfSurv <- transform(dfSurv, Xcut=cut(X, breaks=c(-Inf, median(X), Inf),
labels=c("lo", "hi")))
KMiv   <- survfit(Surv(obsT, status) ~ IV,   type="kaplan-meier", data=dfSurv)
KMxcut <- survfit(Surv(obsT, status) ~ Xcut, type="kaplan-meier", data=dfSurv)

plot(KMiv, fun="cloglog", main="cloglog-Plot for IV1", xlab="ln t",
ylab=expression(ln(-ln(hat(S)[g](t)))), col=c("black", "blue", "red"), lty=1:3)

legend(x="topleft", col=c("black", "blue", "red"), lwd=2, lty=1:3, legend=LETTERS[1:3])
plot(KMxcut, fun="cloglog", main="cloglog-Plot for Xcut", xlab="ln t",
ylab=expression(ln(-ln(hat(S)[g](t)))), col=c("black", "blue"), lty=1:2)

legend(x="topleft", col=c("black", "blue"), lwd=2, lty=1:2, legend=c("lo", "hi"))

Test based on scaled Schoenfeld-residuals

library(survival)                      # for cox.zph()
(czph <- cox.zph(fitCPH))
       chisq df    p
X      0.929  1 0.34
IV     3.062  2 0.22
GLOBAL 5.037  3 0.17

Plot scaled Schoenfeld-residuals against covariates

par(mfrow=c(1, 2), cex.main=1.4, cex.lab=1.4)
plot(czph)

### Influential observations

dfbetas <- residuals(fitCPH, type="dfbetas")

par(mfrow=c(2, 2), cex.main=1.4, cex.lab=1.4)
plot(dfbetas[ , 1], type="h", main="DfBETAS for X",    ylab="DfBETAS", lwd=2)
plot(dfbetas[ , 2], type="h", main="DfBETAS for IV-B", ylab="DfBETAS", lwd=2)
plot(dfbetas[ , 3], type="h", main="DfBETAS for IV-C", ylab="DfBETAS", lwd=2)

### Linearity of log hazard

Plot martingale-residuals against continuous covariate with a nonparametric LOESS-smoother

resMart <- residuals(fitCPH, type="martingale")
plot(dfSurv$X, resMart, main="Martingale-residuals for X", xlab="X", ylab="Residuen", pch=20) lines(loess.smooth(dfSurv$X, resMart), lwd=2, col="blue")
legend(x="bottomleft", col="blue", lwd=2, legend="LOESS fit", cex=1.4)

## Predicted hazard ratios

Hazard ratio relative to an average pseudo-observation with covariate values equal to the sample means. For factors: means of dummy-coded indicator variables.

library(survival)
predRes <- predict(fitCPH, type="risk")
head(predRes, n=10)
 [1] 1.9166403 1.5389801 1.3210722 0.8617067 2.2969713 0.8735328 3.4527637
[8] 2.5002058 1.0455378 0.7079919

Estimated global survival function.

library(survival)
Shat1 <- survexp(~ 1, ratetable=fitCPH, data=dfSurv)
with(Shat1, head(data.frame(time, surv), n=4))
  time      surv
1    1 0.9269729
2    2 0.8643413
3    3 0.7681378
4    4 0.7173365

Estimated survival function per group.

library(survival)
Shat2 <- survexp(~ IV, ratetable=fitCPH, data=dfSurv)
with(Shat2, head(data.frame(time, surv), n=4))
  time      IV.A      IV.B      IV.C
1    1 0.9265656 0.9633824 0.8909708
2    2 0.8626937 0.9299972 0.8003328
3    3 0.7630677 0.8746553 0.6666905
4    4 0.7097748 0.8431836 0.5990511

## Detach (automatically) loaded packages (if possible)

try(detach(package:survival))