想象一下
给出data.table
中的R
library(data.table)
dtable = data.table(
id = c(rep(1, 3), rep(2, 4), rep(3, 2)),
time = c(seq(1, 3, 1), seq(1, 4, 1), seq(3, 4)),
state_1 = c('A', 'A', 'B', 'A', 'B', 'B', 'B', 'A', 'A'),
state_2 = c('A', 'B', 'A', NA, 'B', 'B', NA, 'A', 'A')
)
评估为
id time state_1 state_2
1: 1 1 A A
2: 1 2 A B
3: 1 3 B A
4: 2 1 A <NA>
5: 2 2 B B
6: 2 3 B B
7: 2 4 B <NA>
8: 3 3 A A
9: 3 4 A A
我希望跟踪每行中每个状态处于当前状态的时间。我希望我的数据都是左检查而不是。即对于每个NA
的第一次观察,一个解决方案应始终返回id
,直到观察到状态的变化。另一个解决方案应该处理第一个观察,就像状态刚刚改变到那个状态。我的结果data.table
应该返回
id time state_1 state_2 time_in_state_1_censored time_in_state_2_censored time_in_state_1 time_in_state_2
1: 1 1 A A NA NA 0 0
2: 1 2 A B NA 0 1 0
3: 1 3 B A 0 0 0 0
4: 2 1 A <NA> NA NA 0 0
5: 2 2 B B 0 0 0 0
6: 2 3 B B 1 1 1 1
7: 2 4 B <NA> 2 0 2 0
8: 3 3 A A NA NA 0 0
9: 3 4 A A NA NA 1 1
我使用rle
(在id < 3
上)部分解决了非审查部分
dtable[id < 3,
(paste0('time_in_', columns)) :=
lapply(.SD, function(col) unlist(sapply(rle(col)$lengths, function(x) 1:x-1))),
by='id', .SDcols = columns]
但我相信它可以更聪明,更强大,更高效地解决。
答案 0 :(得分:1)
我已通过以下
解决了这个问题dtable[,
(paste0('time_in_', columns, '_censored')) :=
lapply(.SD, function(col) {
rles = rle(col)
res = rep(NA, rles$lengths[1])
if (length(rles$lengths) > 1){
res = c(res, unlist(sapply(rle(col)$lengths[-1], function(x) 1:x-1)))
}
return(as.integer(res))
}),
by='id', .SDcols = columns]
dtable[,
(paste0('time_in_', columns)) :=
lapply(.SD, function(col) {
rles = rle(col)
if (length(rles$lengths) > 1){
res = unlist(sapply(rle(col)$lengths, function(x) 1:x-1))
} else {
res = 0:(rles$lengths[1]-1)
}
return(as.integer(res))
}),
by='id', .SDcols = columns]
评估为
id time state_1 state_2 time_in_state_1_censored time_in_state_2_censored time_in_state_1 time_in_state_2
1: 1 1 A A NA NA 0 0
2: 1 2 A B NA 0 1 0
3: 1 3 B A 0 0 0 0
4: 2 1 A <NA> NA NA 0 0
5: 2 2 B B 0 0 0 0
6: 2 3 B B 1 1 1 1
7: 2 4 B <NA> 2 0 2 0
8: 3 3 A A NA NA 0 0
9: 3 4 A A NA NA 1 1
答案 1 :(得分:1)
未经审查的是
dtable[, v := rowid(rleid(state_1)) - 1L, by = id]
从那里,为了得到被审查的那个,我会......
# label spells in each state
dtable[, spell_num := rleid(state_1), by=id]
# overwrite with NA for the first spell
dtable[, vc := v][spell_num == 1L, vc := NA]
要为多个状态列执行此操作,我将使用循环:
for (s in sprintf("state_%s", 1:2)){
sid = sub(".*_(.*)$", "\\1", s)
outnm_un = sprintf("v_%s", sid)
outnm_cs = sprintf("vc_%s", sid)
# label spells in each state
dtable[, spell_num := rleidv(.SD), by=id, .SDcols = s]
# create uncensored var
dtable[, (outnm_un) := rowid(spell_num) - 1L, by=id]
# overwrite with NA for the first spell to get the censored var
dtable[, (outnm_cs) := get(outnm_un)][spell_num == 1L, (outnm_cs) := NA]
}
# clean up
dtable[, spell_num := NULL]
rm(s, sid, outnm_un, outnm_cs)
给出了
id time state_1 state_2 v vc v_1 vc_1 v_2 vc_2
1: 1 1 A A 0 NA 0 NA 0 NA
2: 1 2 A B 1 NA 1 NA 0 0
3: 1 3 B A 0 0 0 0 0 0
4: 2 1 A NA 0 NA 0 NA 0 NA
5: 2 2 B B 0 0 0 0 0 0
6: 2 3 B B 1 1 1 1 1 1
7: 2 4 B NA 2 2 2 2 0 0
8: 3 3 A A 0 NA 0 NA 0 NA
9: 3 4 A A 1 NA 1 NA 1 NA
简化修改
按照上面的解决方案,它可以压缩成
columns = c('state_1', 'state_2')
censor = TRUE
dtable[, (paste0('time_in_', columns)) := lapply(.SD, function(sd_col){
spell_num = rleid(sd_col)
v = rowid(spell_num) - 1
if (isTRUE(censor)) v[spell_num == 1] <- NA
v
}), by=id, .SDcols = columns]