我正在尝试拟合Cox回归模型(需要的包:survival),但遇到了问题。当我尝试用“常规”Cox回归模型拟合我的数据时,我收到错误消息“X matrix deemed to be singular; variable 9”(X矩阵被判定为奇异;变量9)(如果我删除变量9,问题就变成了变量8)。据我所知,出现这个问题是因为具有这些变量的患者中有太多人发生了相同的事件(我记得在另一个问题中,这被称为“完美分类”)。
这就是为什么我尝试使用coxphf函数(来自同名的包)来拟合Cox模型,因为它对Cox回归采用“Firth惩罚最大似然偏差缩减方法”,应该能够解决这个问题。但这似乎也不起作用,直到我将“maxit”从默认值50增加到1000,并从模型公式中删除“Undefined”变量。但是,如果我从数据集中删除Undefined变量(只有1名患者属于该变量),模型似乎又不再起作用了。
所以我的问题是:我该如何解决这个问题?从数据集中删除整个变量(也就是删除那1名患者)是否合适/必要?我可能犯了一些非常明显的错误,但请多多包涵,因为我完全没有统计学背景。提前非常感谢。
我在下面附上了示例数据,以及我拟合Cox模型的尝试。下面是我通过从模型中省略“Undefined”变量而让模型得以运行的方式:
# load packages
library("survival")
library ("coxphf")
# Example data set, written as a structure() literal of class "data.frame"
# with 50 rows (row.names = c(NA, 50L)) and these columns:
#   Pat.nr.      - integer patient number
#   OS, Event    - numeric and 0/1 integer columns from which the script
#                  below builds the survival outcome via Surv(OS, Event)
#   Age          - numeric covariate
#   Favorable, Adverse, Undefined - 0/1 integer indicators
#   Intermediate - integer indicator coded 0/2 (not 0/1)
# Undefined is 1 for exactly one row and 0 everywhere else.
example<-structure(list(Pat.nr. = c(1L, 2L, 5L, 7L, 8L, 10L, 13L, 14L,
15L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 26L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 39L, 41L, 42L, 44L, 45L, 46L, 47L,
48L, 49L, 50L, 52L, 53L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L,
67L, 68L, 69L), OS = c(1.6, 34.6, 1.5, 35.8, 7.7, 38.6, 37.6,
8.6, 0.6, 5.7, 0.6, 43.9, 25.8, 7.3, 28.1, 43.8, 12.8, 18.5,
36.1, 43.1, 15.4, 37.6, 8.6, 2.7, 10.2, 8.1, 37.3, 25.3, 3.7,
26.1, 41.2, 5.9, 15.5, 56.8, 29.5, 52.1, 5.4, 54.8, 53.5, 16.6,
49.2, 53.8, 8.5, 56, 7.4, 28, 3.3, 38, 55.7, 0.4), Event = c(1L,
0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
1L), Age = c(68.41, 54.9, 44.44, 64.14, 68.86, 62.93, 40.76,
31.06, 42.97, 69.16, 47.39, 60.14, 27.9, 56.57, 19.63, 47.75,
45.58, 66.22, 43.73, 45.34, 38.83, 54.46, 48.91, 70.3, 60.51,
68.55, 63.18, 55.89, 68.27, 57.25, 56.17, 60.83, 74.42, 71.3,
40.36, 50.85, 59.61, 50.14, 45.77, 19.34, 56.32, 53.38, 70.7,
55.25, 56.05, 44.06, 51.36, 69.37, 69.71, 75.44), Favorable = c(0L,
0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,
0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L), Intermediate = c(0L, 2L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 2L,
0L, 2L, 2L, 0L, 2L, 0L, 2L, 2L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L,
2L, 0L, 0L, 2L, 2L, 2L, 0L, 2L, 2L, 0L, 0L, 2L, 0L, 2L, 0L, 0L,
0L, 2L, 2L, 0L, 2L, 0L, 2L, 2L), Adverse = c(1L, 0L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Undefined = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L)), .Names = c("Pat.nr.", "OS", "Event", "Age", "Favorable",
"Intermediate", "Adverse", "Undefined"), row.names = c(NA, 50L
), class = "data.frame")
# Fit Cox models with Firth's penalized likelihood (coxphf) to `example`.
#
# The survival outcome is stored as a column of `example` and every covariate
# is referenced by column name in the formula. That way `data = example` is
# really what the model is fitted on (a formula such as `y ~ x` is resolved
# from free-standing objects instead), the raw data frame is not overwritten,
# and the coefficients are labelled with the column names.

# Outcome vectors, selected by column name rather than by column position.
OS <- example[["OS"]]
Event <- example[["Event"]]

# Survival response: follow-up time `OS` with event indicator `Event`.
example$y <- Surv(time = OS, event = Event)

# Model that keeps the `Undefined` group.
# Favorable, Intermediate (coded 0/2), Adverse and Undefined look like
# mutually exclusive indicators of a single risk classification. Entering all
# four at once makes the covariates linearly dependent, which is what the
# "X matrix deemed to be singular" message points at, and no penalty can make
# such a model identifiable. `Favorable` is therefore left out and acts as the
# reference group for the other indicators.
# NOTE(review): confirm that every patient has exactly one indicator set and
# that Favorable is the reference group you want to compare against.
cox2 <- coxphf(
  y ~ Age + Intermediate + Adverse + Undefined,
  data = example,
  firth = TRUE,
  pl = TRUE,
  maxit = 1000
)
summary(cox2)

# Model without the `Undefined` indicator. All three remaining indicators can
# stay in the formula here because the one patient with Undefined == 1 is
# still in the data and becomes the implicit reference group; the estimates
# are therefore contrasts against that single patient.
cox2 <- coxphf(
  y ~ Age + Favorable + Intermediate + Adverse,
  data = example,
  firth = TRUE,
  pl = TRUE,
  maxit = 1000
)
summary(cox2)
在这里,我修改了数据集,使其不再包含Undefined变量(模型不再起作用):
# Second example data set, written as a structure() literal of class
# "data.frame" with 50 rows (row.names = c(NA, 50L)) and these columns:
#   Pat.nr.      - integer patient number
#   OS, Event    - numeric and 0/1 integer columns from which the script
#                  below builds the survival outcome via Surv(OS, Event)
#   Age          - numeric covariate
#   Favorable, Adverse - 0/1 integer indicators
#   Intermediate - integer indicator coded 0/2 (not 0/1)
# This literal has no Undefined column.
example1<-structure(list(Pat.nr. = c(1L, 2L, 5L, 7L, 8L, 10L, 13L, 14L,
15L, 17L, 19L, 20L, 21L, 22L, 23L, 26L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 39L, 41L, 42L, 44L, 45L, 46L, 47L, 48L,
49L, 50L, 52L, 53L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 67L,
68L, 69L, 72L), OS = c(1.6, 34.6, 1.5, 35.8, 7.7, 38.6, 37.6,
8.6, 0.6, 5.7, 43.9, 25.8, 7.3, 28.1, 43.8, 12.8, 18.5, 36.1,
43.1, 15.4, 37.6, 8.6, 2.7, 10.2, 8.1, 37.3, 25.3, 3.7, 26.1,
41.2, 5.9, 15.5, 56.8, 29.5, 52.1, 5.4, 54.8, 53.5, 16.6, 49.2,
53.8, 8.5, 56, 7.4, 28, 3.3, 38, 55.7, 0.4, 2.8), Event = c(1L,
0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,
1L), Age = c(68.41, 54.9, 44.44, 64.14, 68.86, 62.93, 40.76,
31.06, 42.97, 69.16, 60.14, 27.9, 56.57, 19.63, 47.75, 45.58,
66.22, 43.73, 45.34, 38.83, 54.46, 48.91, 70.3, 60.51, 68.55,
63.18, 55.89, 68.27, 57.25, 56.17, 60.83, 74.42, 71.3, 40.36,
50.85, 59.61, 50.14, 45.77, 19.34, 56.32, 53.38, 70.7, 55.25,
56.05, 44.06, 51.36, 69.37, 69.71, 75.44, 71.05), Favorable = c(0L,
0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
1L), Intermediate = c(0L, 2L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 2L,
2L, 2L, 0L, 2L, 0L, 2L, 2L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 2L,
0L, 0L, 2L, 2L, 2L, 0L, 2L, 2L, 0L, 0L, 2L, 0L, 2L, 0L, 0L, 0L,
2L, 2L, 0L, 2L, 0L, 2L, 2L, 0L), Adverse = c(1L, 0L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L)), .Names = c("Pat.nr.",
"OS", "Event", "Age", "Favorable", "Intermediate", "Adverse"), row.names = c(NA,
50L), class = "data.frame")
# Fit a Cox model with Firth's penalized likelihood (coxphf) to `example1`,
# the data set that has no Undefined column.
#
# The survival outcome is stored as a column of `example1` and the model is
# fitted on `example1` itself, with covariates referenced by column name.
# `example` is left untouched.

# Outcome vectors, selected by column name rather than by column position.
OS <- example1[["OS"]]
Event <- example1[["Event"]]

# Survival response: follow-up time `OS` with event indicator `Event`.
example1$y <- Surv(time = OS, event = Event)

# In `example1` the rows appear to be partitioned by Favorable, Intermediate
# (coded 0/2) and Adverse: each patient has exactly one of them set. The three
# columns are then linearly dependent (Favorable + Intermediate / 2 + Adverse
# is the same for every row), and a Cox model cannot separate such a constant
# from the baseline hazard. That is why a formula with all three indicators
# cannot be fitted, with or without the Firth penalty. `Favorable` is left
# out and acts as the reference group.
# NOTE(review): confirm the one-group-per-patient coding and the choice of
# reference group. Because Intermediate is coded 0/2, its coefficient is per
# unit; the Intermediate-vs-Favorable log hazard ratio is twice that value.
cox2 <- coxphf(
  y ~ Age + Intermediate + Adverse,
  data = example1,
  firth = TRUE,
  pl = TRUE,
  maxit = 1000
)
summary(cox2)
答案 0 :(得分:0)
我处理你的数据的方式略有不同,没有遇到任何问题。首先,我排除了Undefined列,因为没有任何变化的变量对您的模型没有任何作用(并且会产生错误)。然后,我只是将您的Surv()对象绑定到您提供的数据上:
# Attach the survival outcome to `example`, built from the data frame's own
# OS (time) and Event (event indicator) columns. Evaluating inside `example`
# keeps the outcome aligned with its rows even if free-standing `OS` / `Event`
# vectors have since been re-bound to another data set.
example$survive <- with(example, Surv(time = OS, event = Event))
并使用默认设置调用coxphf()(firth和pl默认均为TRUE),不更改默认的迭代次数50,并使用了正确的公式写法:
# Firth-penalized Cox fit on the `survive` outcome stored in `example`.
# Only the formula and the data are supplied; every other coxphf() argument
# keeps its default.
cox2 <- coxphf(
  formula = survive ~ Age + Favorable + Intermediate + Adverse,
  data = example
)
> summary(cox2)
coxphf(formula = survive ~ Age + Favorable + Intermediate + Adverse,
data = example)
Model fitted by Penalized ML
Confidence intervals and p-values by Profile Likelihood
coef se(coef) exp(coef) lower 0.95 upper 0.95 Chisq p
Age 0.005652315 0.01598045 1.00566832 0.976272961 1.0391381 0.1297188 0.71872373
Favorable -3.335329337 1.25086821 0.03560286 0.004236556 0.4169422 6.3231629 0.01191709
Intermediate -1.672995547 0.61363779 0.18768401 0.066460220 0.6321023 6.4756709 0.01093610
Adverse -3.412597661 1.25225958 0.03295548 0.003909284 0.3863706 6.4164893 0.01130655
Likelihood ratio test=6.535659 on 4 df, p=0.1625574, n=50
Wald test = 7.794526 on 4 df, p = 0.09940164
Covariance-Matrix:
Age Favorable Intermediate Adverse
Age 0.0002553747 -0.00220437 -0.001027825 -0.001760528
Favorable -0.0022043699 1.56467129 0.710597909 1.408715695
Intermediate -0.0010278246 0.71059791 0.376551335 0.705180703
Adverse -0.0017605280 1.40871570 0.705180703 1.568154050
我可以问一下,为什么Intermediate列的值设置为2而不是1?它难道不只是分类变量的一个二元指示变量吗?