按条件选择数据框的行

时间:2013-09-08 16:01:42

标签: r dataframe

我有以下用于生存分析的数据框。id是受试者ID,event表示事件是否发生,time是每次观测的时间。

# Build the example data: every id crossed with a 31-element 0/1 event pattern.
# expand.grid() varies its first argument fastest, so the rows cycle through
# the ids A..E for each successive element of `event`.
test.df <- data.frame(expand.grid(
  id = c("A", "B", "C", "D", "E"),
  event = c(
    0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
    0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1
  )
))
# One simulated observation time per row, drawn from N(5, 1) and rounded to one
# decimal. nrow() replaces the hard-coded 155 so the number of draws always
# matches the grid, even if the id or event vectors are edited.
test.df$time <- round(rnorm(nrow(test.df), mean = 5, sd = 1), 1)

我想将数据框缩减为以下内容:对于每个id,保留直到(并包括)event第一次等于1的所有行。每个id中第一次event = 1之后的所有行(无论event = 0还是event = 1)都不应出现在缩减后的数据框中。

请注意length(unique(test.df$time))<length(test.df$time)(如果使用ave()解决方案)。

此外,可能存在第一个观测值就是event = 1的id,因此这些id不会包含在缩减后的数据框中。

有一个很好的方法吗?

我尝试test.df$cumsum<-ave(test.df$event, test.df$id, cumsum),但似乎没有按照我的预期工作。

编辑:这是按DWin的要求提供的数据框

# Fixed (non-random) example data written as a structure() literal
# (looks like dput() output -- TODO confirm).
# Columns, per .Names: id (factor with levels A-E), event (numeric 0/1),
# time (numeric). The row.names are explicit, non-sequential integers.
test.df<-structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L), .Label = c("A", "B", "C", "D", "E"), class = "factor"), 
    event = c(1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 
    1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 
    0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 
    1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 
    1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 
    0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 
    0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 
    0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 
    0, 0, 0, 1, 0, 0), time = c(2.9, 3.6, 3.6, 3.7, 4.2, 4.2, 
    4.3, 4.3, 4.4, 4.6, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5, 5.1, 
    5.1, 5.2, 5.3, 5.4, 5.4, 5.4, 5.5, 5.6, 5.6, 6.1, 6.3, 6.7, 
    6.8, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 4.1, 4.3, 4.3, 4.4, 4.4, 
    4.5, 4.5, 4.6, 4.7, 4.7, 4.8, 4.8, 5.1, 5.1, 5.2, 5.3, 5.4, 
    5.5, 5.6, 5.8, 5.9, 5.9, 6.3, 6.7, 7, 3.3, 3.9, 3.9, 3.9, 
    4.1, 4.2, 4.2, 4.3, 4.3, 4.3, 4.3, 4.4, 4.6, 4.6, 5.2, 5.2, 
    5.3, 5.4, 5.4, 5.4, 5.5, 5.5, 5.5, 5.5, 5.6, 5.8, 5.8, 5.8, 
    6.3, 6.5, 6.7, 2.9, 3.3, 3.7, 4, 4.1, 4.2, 4.3, 4.4, 4.5, 
    4.6, 4.6, 4.7, 4.8, 4.9, 4.9, 4.9, 5.1, 5.2, 5.5, 5.5, 5.7, 
    5.7, 5.8, 5.8, 5.9, 6, 6, 6, 6.4, 6.7, 6.8, 2.9, 3, 3.6, 
    3.6, 3.8, 4.1, 4.4, 4.5, 4.6, 4.6, 4.6, 4.6, 4.7, 4.8, 5, 
    5, 5, 5.1, 5.2, 5.3, 5.5, 5.5, 5.5, 5.7, 5.9, 5.9, 5.9, 5.9, 
    6.1, 6.2, 6.7)), .Names = c("id", "event", "time"), row.names = c(101L, 
1L, 146L, 141L, 106L, 151L, 66L, 111L, 131L, 6L, 91L, 121L, 21L, 
46L, 16L, 26L, 86L, 56L, 76L, 31L, 136L, 41L, 61L, 126L, 36L, 
71L, 81L, 11L, 51L, 116L, 96L, 107L, 112L, 67L, 72L, 2L, 27L, 
77L, 92L, 127L, 47L, 122L, 32L, 82L, 117L, 97L, 132L, 17L, 42L, 
57L, 152L, 147L, 37L, 137L, 12L, 102L, 7L, 52L, 87L, 62L, 22L, 
142L, 128L, 8L, 18L, 113L, 138L, 3L, 78L, 13L, 48L, 73L, 108L, 
143L, 38L, 148L, 68L, 153L, 98L, 23L, 63L, 118L, 53L, 88L, 93L, 
103L, 83L, 33L, 58L, 133L, 43L, 123L, 28L, 44L, 109L, 94L, 144L, 
104L, 89L, 114L, 129L, 59L, 39L, 124L, 19L, 29L, 54L, 69L, 139L, 
14L, 84L, 9L, 134L, 4L, 74L, 24L, 64L, 34L, 49L, 79L, 149L, 119L, 
154L, 99L, 120L, 135L, 125L, 150L, 35L, 15L, 5L, 20L, 40L, 65L, 
80L, 155L, 60L, 145L, 10L, 30L, 50L, 95L, 140L, 90L, 110L, 115L, 
130L, 70L, 25L, 55L, 75L, 100L, 105L, 45L, 85L), class = "data.frame")

1 个答案:

答案 0 :(得分:0)

使用此:

# For every id, keep the rows up to and including the first event == 1.
# An id whose very first row already has event == 1 is dropped entirely:
# its lagged count is NA there, so the selection condition is NA, and
# NA conditions never select a row.
local({
  # Running number of events seen so far within each id
  # (relies on the rows of each id already being in the desired order).
  running <- ave(test.df$event, test.df$id, FUN = cumsum)
  # The same count shifted down by one row per id; NA marks each id's first row.
  previous <- ave(running, test.df$id, FUN = function(x) c(NA, head(x, -1)))

  # Append the helper columns in the order lagcumsum, cumsum.
  augmented <- test.df
  augmented[c("lagcumsum", "cumsum")] <- list(previous, running)

  before_first_event <- running == 0
  is_first_event <- running == 1 & previous == 0
  # which() discards FALSE and NA alike, so NA conditions drop the row.
  augmented[which(before_first_event | is_first_event), ]
})

结果

    id event time lagcumsum cumsum
107  B     0  3.5        NA      0
112  B     1  3.6         0      1
44   D     0  2.9        NA      0
109  D     0  3.3         0      0
94   D     0  3.7         0      0
144  D     1  4.0         0      1
120  E     0  2.9        NA      0
135  E     0  3.0         0      0
125  E     1  3.6         0      1

编辑:现在可以正确删除每个ID中event==1的首行(如果该行之前没有event==0的行)。