如何使用coxphf正确拟合Cox回归?(R中Firth惩罚最大似然偏差缩减方法)

时间:2018-01-31 08:37:35

标签: r survival-analysis cox-regression multivariate-testing

我正在尝试拟合Cox回归(所需的包:survival),但遇到了问题。当我尝试用我的数据拟合“常规”Cox回归模型时,收到错误消息“X matrix deemed to be singular; variable 9”(X矩阵被判定为奇异;变量9)(如果我删除变量9,问题就转移到变量8)。据我理解,出现这个问题是因为在这些变量取值相同的患者中,有太多人具有相同的事件结局(我记得在另一个问题中,这被称为“完美分类”)。

这就是为什么我尝试使用coxphf函数(来自同名的包)来拟合Cox模型,因为它对Cox回归采用“Firth惩罚最大似然偏差缩减方法”,按理应该能解决这个问题。但这似乎也不起作用,直到我将“maxit”从默认值50增加到1000,并从模型公式中删除“Undefined”变量。但是,如果我从数据集中彻底删除Undefined变量(只有1名患者属于该类别),模型似乎又不起作用了。

所以我的问题是,我该如何解决这个问题?从数据集中删除整个变量(也就是删除那1名患者)是否合适/必要?我可能犯了一些非常明显的错误,但请多多包涵,因为我完全没有统计学背景。先在此表示感谢。

我在下面附上了示例数据,以及我拟合Cox模型的尝试。下面是我通过从模型中省略“Undefined”变量而让模型得以运行的做法:

# load packages
library("survival")
library ("coxphf")

# Example data set: 50 rows, one per patient.
#   Pat.nr.    - patient identifier
#   OS, Event  - later passed to Surv(OS, Event) as time and status
#                (presumably overall survival and an event indicator;
#                units are not stated in this file)
#   Age        - numeric age
#   Favorable, Intermediate, Adverse, Undefined
#              - indicator columns (presumably a risk-group classification);
#                Intermediate is coded 0/2, the other three 0/1
# In every row exactly one of the four indicators is non-zero, i.e.
# Favorable + Intermediate / 2 + Adverse + Undefined == 1, so the four columns
# are linearly dependent. Undefined is 1 for a single row only (Pat.nr. 18).
example<-structure(list(Pat.nr. = c(1L, 2L, 5L, 7L, 8L, 10L, 13L, 14L, 
15L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 26L, 28L, 29L, 30L, 31L, 
32L, 33L, 34L, 35L, 36L, 37L, 39L, 41L, 42L, 44L, 45L, 46L, 47L, 
48L, 49L, 50L, 52L, 53L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 
67L, 68L, 69L), OS = c(1.6, 34.6, 1.5, 35.8, 7.7, 38.6, 37.6, 
8.6, 0.6, 5.7, 0.6, 43.9, 25.8, 7.3, 28.1, 43.8, 12.8, 18.5, 
36.1, 43.1, 15.4, 37.6, 8.6, 2.7, 10.2, 8.1, 37.3, 25.3, 3.7, 
26.1, 41.2, 5.9, 15.5, 56.8, 29.5, 52.1, 5.4, 54.8, 53.5, 16.6, 
49.2, 53.8, 8.5, 56, 7.4, 28, 3.3, 38, 55.7, 0.4), Event = c(1L, 
0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 
0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
1L), Age = c(68.41, 54.9, 44.44, 64.14, 68.86, 62.93, 40.76, 
31.06, 42.97, 69.16, 47.39, 60.14, 27.9, 56.57, 19.63, 47.75, 
45.58, 66.22, 43.73, 45.34, 38.83, 54.46, 48.91, 70.3, 60.51, 
68.55, 63.18, 55.89, 68.27, 57.25, 56.17, 60.83, 74.42, 71.3, 
40.36, 50.85, 59.61, 50.14, 45.77, 19.34, 56.32, 53.38, 70.7, 
55.25, 56.05, 44.06, 51.36, 69.37, 69.71, 75.44), Favorable = c(0L, 
0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 
0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 
0L), Intermediate = c(0L, 2L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 2L, 
0L, 2L, 2L, 0L, 2L, 0L, 2L, 2L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 
2L, 0L, 0L, 2L, 2L, 2L, 0L, 2L, 2L, 0L, 0L, 2L, 0L, 2L, 0L, 0L, 
0L, 2L, 2L, 0L, 2L, 0L, 2L, 2L), Adverse = c(1L, 0L, 0L, 1L, 
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 
1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Undefined = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L)), .Names = c("Pat.nr.", "OS", "Event", "Age", "Favorable", 
"Intermediate", "Adverse", "Undefined"), row.names = c(NA, 50L
), class = "data.frame")


# Dimensions of the raw data set.
n_row <- nrow(example)
n_col <- ncol(example)

# Pull each analysis column out of the data frame by name
# (same columns the positional indices 2..8 refer to).
OS <- example[["OS"]]
Event <- example[["Event"]]
age <- example[["Age"]]
Favorable <- example[["Favorable"]]
Intermediate <- example[["Intermediate"]]
Adverse <- example[["Adverse"]]
Undefined <- example[["Undefined"]]


# Survival response and covariate matrix used by the model calls below.
y <- Surv(OS, Event)
x <- cbind(age, Favorable, Intermediate, Adverse, Undefined)
# Note: this replaces the raw `example` data frame with the combined
# covariate/response columns.
example <- data.frame(cbind(x, y))

# Firth-penalized Cox fit on the full covariate matrix `x`, which includes
# `Undefined`. The author reports that this call does not work.
# NOTE(review): in this data set Favorable + Intermediate / 2 + Adverse +
# Undefined equals 1 in every row, so the columns of `x` are linearly
# dependent; that is the likely cause of the failure -- confirm.
cox2 <- coxphf(
  y ~ x,
  data = example,
  firth = TRUE,
  pl = TRUE,
  maxit = 1000
)
summary(cox2)

# Same fit with `Undefined` left out of the formula. The author reports that
# this one works; the single `Undefined` patient then acts as the reference
# group for the three remaining indicators.
cox2 <- coxphf(
  y ~ age + Favorable + Intermediate + Adverse,
  data = example,
  firth = TRUE,
  pl = TRUE,
  maxit = 1000
)
summary(cox2)

在这里,我修改了数据集,使其不再包含Undefined变量(此时模型不再起作用):

# Modified example data set: 50 rows, one per patient, without the
# `Undefined` column. Compared with `example`, Pat.nr. 18 is absent and
# Pat.nr. 72 has been appended as the last row.
#   Pat.nr.    - patient identifier
#   OS, Event  - later passed to Surv(OS, Event) as time and status
#                (presumably overall survival and an event indicator;
#                units are not stated in this file)
#   Age        - numeric age
#   Favorable, Intermediate, Adverse
#              - indicator columns (presumably a risk-group classification);
#                Intermediate is coded 0/2, the other two 0/1
# In every row exactly one of the three indicators is non-zero, i.e.
# Favorable + Intermediate / 2 + Adverse == 1, so the three columns are
# linearly dependent.
example1<-structure(list(Pat.nr. = c(1L, 2L, 5L, 7L, 8L, 10L, 13L, 14L, 
15L, 17L, 19L, 20L, 21L, 22L, 23L, 26L, 28L, 29L, 30L, 31L, 32L, 
33L, 34L, 35L, 36L, 37L, 39L, 41L, 42L, 44L, 45L, 46L, 47L, 48L, 
49L, 50L, 52L, 53L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 67L, 
68L, 69L, 72L), OS = c(1.6, 34.6, 1.5, 35.8, 7.7, 38.6, 37.6, 
8.6, 0.6, 5.7, 43.9, 25.8, 7.3, 28.1, 43.8, 12.8, 18.5, 36.1, 
43.1, 15.4, 37.6, 8.6, 2.7, 10.2, 8.1, 37.3, 25.3, 3.7, 26.1, 
41.2, 5.9, 15.5, 56.8, 29.5, 52.1, 5.4, 54.8, 53.5, 16.6, 49.2, 
53.8, 8.5, 56, 7.4, 28, 3.3, 38, 55.7, 0.4, 2.8), Event = c(1L, 
0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 
0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 
1L), Age = c(68.41, 54.9, 44.44, 64.14, 68.86, 62.93, 40.76, 
31.06, 42.97, 69.16, 60.14, 27.9, 56.57, 19.63, 47.75, 45.58, 
66.22, 43.73, 45.34, 38.83, 54.46, 48.91, 70.3, 60.51, 68.55, 
63.18, 55.89, 68.27, 57.25, 56.17, 60.83, 74.42, 71.3, 40.36, 
50.85, 59.61, 50.14, 45.77, 19.34, 56.32, 53.38, 70.7, 55.25, 
56.05, 44.06, 51.36, 69.37, 69.71, 75.44, 71.05), Favorable = c(0L, 
0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 
0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 
1L), Intermediate = c(0L, 2L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 2L, 
2L, 2L, 0L, 2L, 0L, 2L, 2L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 2L, 
0L, 0L, 2L, 2L, 2L, 0L, 2L, 2L, 0L, 0L, 2L, 0L, 2L, 0L, 0L, 0L, 
2L, 2L, 0L, 2L, 0L, 2L, 2L, 0L), Adverse = c(1L, 0L, 0L, 1L, 
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 
0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L)), .Names = c("Pat.nr.", 
"OS", "Event", "Age", "Favorable", "Intermediate", "Adverse"), row.names = c(NA, 
50L), class = "data.frame")

# Dimensions of the modified data set.
n_row <- nrow(example1)
n_col <- ncol(example1)

# Pull each analysis column out of `example1` by name rather than by
# position, so the code keeps working if the column order changes.
OS <- example1[["OS"]]
Event <- example1[["Event"]]
age <- example1[["Age"]]
Favorable <- example1[["Favorable"]]
Intermediate <- example1[["Intermediate"]]
Adverse <- example1[["Adverse"]]

# Survival response and covariate matrix.
# This section used to appear twice; the first copy wrote its result into the
# unrelated `example` object. It is now built once and stored in `example1`.
y <- Surv(OS, Event)
x <- cbind(age, Favorable, Intermediate, Adverse)
example1 <- data.frame(cbind(x, y))

# coxphf with Firth's penalized likelihood, now pointed at `example1`
# (the call previously passed `data = example`).
# NOTE(review): in `example1` every row has exactly one of Favorable /
# Intermediate / Adverse set (Intermediate is coded 0/2), so
# Favorable + Intermediate / 2 + Adverse is constant and the covariates are
# linearly dependent. That is the likely reason this fit fails; dropping one
# indicator as the reference group, or modelling the group as a single factor,
# should be tried -- confirm.
cox2 <- coxphf(y ~ x, data = example1, firth = TRUE, pl = TRUE, maxit = 1000)
summary(cox2)

1 个答案:

答案 0 :(得分:0)

我对你的数据采用了略有不同的做法,没有遇到任何问题。首先,我排除了Undefined列,因为一个没有变异的变量对你的模型没有任何贡献(而且会导致错误)。

然后,我只是将您的Surv()对象绑定到您提供的数据:

# Store the survival response as a new column of `example`; OS and Event are
# looked up in the calling environment, not inside `example`.
example$survive <- Surv(time = OS, event = Event)

然后使用默认设置调用coxphf()(firth和pl默认均为TRUE),没有更改默认的最大迭代次数50,并使用了正确的公式写法:

# Fit with coxphf() defaults; every term is a named column of `example`,
# and `Undefined` is left out of the formula.
cox2 <- coxphf(survive ~ Age + Favorable + Intermediate + Adverse, data = example)

> summary(cox2)
coxphf(formula = survive ~ Age + Favorable + Intermediate + Adverse, 
    data = example)

Model fitted by Penalized ML
Confidence intervals and p-values by Profile Likelihood 

                     coef   se(coef)  exp(coef)  lower 0.95 upper 0.95     Chisq          p
Age           0.005652315 0.01598045 1.00566832 0.976272961  1.0391381 0.1297188 0.71872373
Favorable    -3.335329337 1.25086821 0.03560286 0.004236556  0.4169422 6.3231629 0.01191709
Intermediate -1.672995547 0.61363779 0.18768401 0.066460220  0.6321023 6.4756709 0.01093610
Adverse      -3.412597661 1.25225958 0.03295548 0.003909284  0.3863706 6.4164893 0.01130655

Likelihood ratio test=6.535659 on 4 df, p=0.1625574, n=50
Wald test = 7.794526 on 4 df, p = 0.09940164

Covariance-Matrix:
                       Age   Favorable Intermediate      Adverse
Age           0.0002553747 -0.00220437 -0.001027825 -0.001760528
Favorable    -0.0022043699  1.56467129  0.710597909  1.408715695
Intermediate -0.0010278246  0.71059791  0.376551335  0.705180703
Adverse      -0.0017605280  1.40871570  0.705180703  1.568154050

请问为什么Intermediate列的值被设为2而不是1?它难道不只是某个分类变量的二元指示变量吗?