Andersen-Gill计算过程中使用survival进行Cox回归的无风险区间:: tmerge()

时间:2017-06-15 00:21:37

标签: r survival-analysis cox-regression

我想使用tmerge()函数转换数据集,以便在重复事件的Cox回归框架的Andersen-Gill扩展中使用。见Therneau的excellent vignette

我想指明个人在事件发生后30天内对重复事件免疫,即我希望个人暂时退出风险集,以便在个人不在以下情况时发生事件 - 风险,它被忽略了。

一种原始方法是迭代地添加所有事件,然后简单地将30添加到tstart变量。但是,这可能导致实例tstart >= tstop,并且在更大和更复杂的数据集中将是灾难性的。

我试图利用forloop来利用tmerge()函数来纠正我上面提到的问题。对于此示例,我将使用生存包中的cgd数据。

编辑:请参阅下面更正的forloop

library(survival)
cgd0 <- cgd0
newcgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)

for(i in 1:7){        
    x <- paste0("etime", i)  #etime1:etime7

# iteratively add each event
    newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))

# select only observations that end in an event and iteratively create
# cumulative number of events for each individual
    newcgd <- tmerge(newcgd, subset(newcgd, infect == 1),
                     id = id, cum_infect = cumtdc(tstop))

# for each loop add 30 days to the start time of the ith cumulative event
    newcgd[which(newcgd$cum_infect == i), "tstart"] <-
           newcgd[which(newcgd$cum_infect == i), "tstart"] + 30

# for each loop remove observations were the start time >= stop time
    newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
}

attr(newcgd, "tcount")
#            early late gap within boundary leading trailing tied
#infect         0    0   0     44        0       0        0    0
#cum_infect     0    0   0      0       44       0        0    0
#infect         0    0   4     11        0       1        1    0
#cum_infect     0    0   0      0       11       0       45    0
#infect         0    0   2      6        0       0        0    0
#cum_infect     0    0   0      0        6       0       56    0
#infect         0    0   1      2        0       0        0    0
#cum_infect     0    0   0      0        6       0       58    0
#infect         0    0   0      2        0       0        0    0
#cum_infect     0    0   0      0        8       0       58    0
#infect         0    0   0      1        0       0        0    0
#cum_infect     0    0   0      0        9       0       58    0
#infect         0    0   0      1        0       0        0    0
#cum_infect     0    0   0      0       10       0       58    0 

我相信这个解决方案是正确的。然而,这是生存分析中的常见问题,我担心

i)我忽略了一些东西而且代码没有做我认为它做的事情。

ii)我在R

中忽略了一种经过验证的方法

iii)如果i)和ii)不是问题,我相信这段代码效率低下,并且想知道是否有明显的方法可以提高执行速度。

-------------------------------------------- -------------------------------------------------- -------------------------------------

编辑:使用评论进一步进行错误检查。希望这能澄清我试图做的事情。从概念上;我指的是个人在经历一次事件后30天内不会有再次发生其他事件的风险。在Andersen-Gill计数过程公式中,每一行代表一个包含开始时间tstart和停止时间tstop以及指示符(在本例中为infect)的观察结果观察是否因事件infect == 1或审查infect == 0而结束。在这里,我手动完成上述forloop中的步骤,并量化每个循环发生的事件数量以及指定30天免疫期间的总跟踪时间。然后将相同的代码实现为forloop以实现完整性。结果显示在下面的单独代码块中。

newcgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)

###1st event

x <- "etime1"
immunecgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 1), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 1), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 1), "tstart"] <- newcgd[which(newcgd$cum_infect == 1), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]

etime1 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime1 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))

###2nd event
x <- "etime2"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 2), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 2), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 2), "tstart"] <- newcgd[which(newcgd$cum_infect == 2), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]

etime2 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime2 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))

###3rd event
x <- "etime3"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 3), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 3), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 3), "tstart"] <- newcgd[which(newcgd$cum_infect == 3), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]

etime3 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime3 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))

###4th event
x <- "etime4"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 4), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 4), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 4), "tstart"] <- newcgd[which(newcgd$cum_infect == 4), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]

etime4 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime4 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))

###5th event
x <- "etime5"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 5), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 5), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 5), "tstart"] <- newcgd[which(newcgd$cum_infect == 5), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]

etime5 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime5 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))

###6th event
x <- "etime6"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 6), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 6), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 6), "tstart"] <- newcgd[which(newcgd$cum_infect == 6), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]

etime6 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime6 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))

###7th event
x <- "etime7"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 7), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 7), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]

newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 7), "tstart"] <- newcgd[which(newcgd$cum_infect == 7), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]

etime7 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime7 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))

df_event <- rbind.data.frame(etime1, etime2, etime3, etime4, etime5, etime6, etime7)
colnames(df_event) <- c("immunity", "no_immunity")
df_event$diff <- df_event$no_immunity - df_event$immunity

df_futime <- rbind.data.frame(futime1, futime2, futime3, futime4, futime5, futime6, futime7)
colnames(df_futime)  <- c("immunity", "no_immunity")
df_futime$diff <- df_futime$no_immunity - df_futime$immunity

与forloop相同的代码。

 newcgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)
 immunecgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)

event <- matrix(NA, nrow = 7, ncol = 2)
futime <- matrix(NA, nrow = 7, ncol = 2)
for(i in 1:7){        
    x <- paste0("etime", i)  #etime1:etime7

    # iteratively add each event
    immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
    newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))

    # select only observations that end in an event and iteratively create
    # cumulative number of events for each individual
    immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
    newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))

    # for each loop add 30 days to the start time of the ith cumulative event
    immunecgd[which(immunecgd$cum_infect == i), "tstart"] <- immunecgd[which(immunecgd$cum_infect == i), "tstart"] + 30
    newcgd[which(newcgd$cum_infect == i), "tstart"] <- newcgd[which(newcgd$cum_infect == i), "tstart"]

    # for each loop remove observations were the start time >= stop time
    immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
    newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]

    event[i,] <- c(sum(immunecgd$infect), sum(newcgd$infect))
    futime[i,] <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
}

event <- data.frame(event)
colnames(event) <- c("immunity", "no_immunity")
event$diff <- event$no_immunity - event$immunity

futime <- data.frame(futime)
colnames(futime) <- c("immunity", "no_immunity")
futime$diff <- futime$no_immunity - futime$immunity

上面的错误测试代码给出了以下结果

df_event
  immunity no_immunity diff
1       44          44    0
2       56          61    5
3       62          69    7
4       64          72    8
5       66          74    8
6       67          75    8
7       68          76    8

df_futime
  immunity no_immunity diff
1    36202       37477 1275
2    35935       37477 1542
3    35875       37477 1602
4    35875       37477 1602
5    35875       37477 1602
6    35875       37477 1602
7    35875       37477 1602

-------------------------------------------- -------------------------------------------------- -------------------------------------

通过对survival包中的不同数据集,模拟数据集和我自己的个人数据集(我希望使用此代码的数据集)进行进一步测试,我发现了一个“故障”。在上面的代码版本中,如果新事件etime[i-1]属于其中一个时期,那么我们已经指定了个人不具备事件 - 这正是代码旨在创建的实例 - 事件未纳入累积事件计数器cum_infect。在下一次运行etime[i]期间,个人将只有[i-1]个累积事件,并且控制是否应将30天添加到开始时间的代码部分

immunecgd[which(immunecgd$cum_infect == i), "tstart"] <- immunecgd[which(immunecgd$cum_infect == i), "tstart"] + 30

不会将个人识别为有事件。这意味着forloop只会在事件发生后正确添加30天的免疫力,直到事件的第一次发生在这样的免疫期。我提出了一个相当不优雅的解决方案。但它的确有效。

newcgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)
immunecgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)
newcgd$cum_infect_0 <- 0
immunecgd$cum_infect_0 <- 0
event <- matrix(NA, nrow = 7, ncol = 2)
futime <- matrix(NA, nrow = 7, ncol = 2)
for(i in 1:7){        
    x <- paste0("etime", i)  #etime1:etime7

    # iteratively add each event
    immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
    newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))

    # select only observations that end in an event and iteratively create
    # cumulative number of events for each individual
    immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
    newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))

    # create new column that will hold cumulative events between loops
    immunecgd[, paste0("cum_infect_", i)] <- immunecgd[, "cum_infect"]
    newcgd[, paste0("cum_infect_", i)] <- newcgd[, "cum_infect"]

    # for each loop add 30 days to the start time if there is atleast one cumulative event
    # and the value of the ith cumulative event is larger than the i-1th cumulative event
    immunecgd[which(immunecgd$cum_infect > 0 & immunecgd$cum_infect > immunecgd[, paste0("cum_infect_", i - 1)]), "tstart"] <-
        immunecgd[which(immunecgd$cum_infect > 0 & immunecgd$cum_infect > immunecgd[, paste0("cum_infect_", i - 1)]), "tstart"] + 30
    newcgd[which(newcgd$cum_infect > 0 & newcgd$cum_infect > newcgd[, paste0("cum_infect_", i - 1)]), "tstart"] <-
        newcgd[which(newcgd$cum_infect > 0 & newcgd$cum_infect > newcgd[, paste0("cum_infect_", i - 1)]), "tstart"]

    # for each loop remove observations were the start time >= stop time
    immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
    newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]

    event[i,] <- c(sum(immunecgd$infect), sum(newcgd$infect))
    futime[i,] <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
}
immunecgd <- immunecgd[,!grepl("cum_infect_", colnames(immunecgd))]
newcgd <- newcgd[,!grepl("cum_infect_", colnames(newcgd))]

event <- data.frame(event)
colnames(event) <- c("immunity", "no_immunity")
event$diff <- event$no_immunity - event$immunity

futime <- data.frame(futime)
colnames(futime) <- c("immunity", "no_immunity")
futime$diff <- futime$no_immunity - futime$immunity 

在这里,我们可以看到事件总数的差异

  immunity no_immunity diff
1       44          44    0
2       56          61    5
3       62          69    7
4       64          72    8
5       65          74    9
6       66          75    9
7       66          76   10

正确指定forloop已经发现另外2个实例是免疫期间发生的事件。

1 个答案:

答案 0 :(得分:0)

跟进我的评论,这是我在尝试在代码中实现它时看到的内容:

 with( newcgd, table( tstart-tstop <= 30, infect))
 #-------------
      infect
         0   1
  TRUE 120  68

所以,如果我理解你的目标,我不会认为你在那里,我想知道你是否搞砸了,因为:

> newcgd$infect <- with( newcgd,ifelse(infect, tstart-tstop > 30, 0 ) )
> with( newcgd, table( tstart-tstop <= 30, infect))
      infect
         0
  TRUE 188

当我将所有短间隔事件设置为0时,我根本没有事件。但也许我还没有理解这些问题?