按行和ID识别事件的发生

时间:2014-11-19 21:40:32

标签: r

我有一个面板数据框,其中 1 表示事件发生,0 表示事件未发生。如何识别出 a 和 b 都(在该 ID 的任意行中)发生过的 ID(例如 ID 2 和 3)?而在 ID 1 和 5 中,事件只出现在 a 或 b 其中一列,而不是两列都出现。

示例数据以及我期望得到的结果(occur 列)如下所示

# Example panel data as dput() output: 21 rows with columns id, a, b, occur.
# a and b hold 0/1 event indicators; occur is the asker's desired result
# column (1 for every row of an id where both a and b occurred).
structure(list(id = c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 
5L, 6L, 6L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L), a = c(0L, 0L, 1L, 
0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 0L), b = c(1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L), occur = c(0L, 0L, 1L, 1L, 
1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
1L)), .Names = c("id", "a", "b", "occur"), class = "data.frame", row.names = c(NA, 
-21L))

3 个答案:

答案 0 :(得分:3)

以下是两种方式,两种方式都给出了相同的结果

# Example panel: 21 observations of the 0/1 event indicators a and b,
# grouped by the integer id.
tmp <- data.frame(
  id = c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 5L,
         6L, 6L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L),
  a  = c(0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
         1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L),
  b  = c(1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
         0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L)
)

针对每个唯一 ID(每个 ID 一行):

# One row per id: column 1 is the id, column 2 is 1 when both a and b
# occurred at least once within that id, otherwise 0.
# by() reports its groups in sorted id order, so the id column is sorted
# too; a bare unique() keeps first-appearance order and would pair ids
# with the wrong flags whenever tmp is not already sorted by id.
cbind(sort(unique(tmp$id)),
      c(by(tmp, tmp$id, FUN = function(x) all(colSums(x[, c("a", "b")]) > 0))))

#   [,1] [,2]
# 1    1    0
# 2    2    1
# 3    3    1
# 4    4    0
# 5    5    0
# 6    6    1
# 7    7    0
# 8    8    0
# 9    9    1

在数据中添加一列

within(tmp, {
  # ave() passes FUN the row numbers belonging to one id at a time; the
  # flag computed for that id is then written back to each of those rows.
  # seq_len() is used instead of 1:nrow(tmp) so that a zero-row data frame
  # produces an empty index rather than c(1, 0).
  res <- ave(seq_len(nrow(tmp)), tmp$id,
             FUN = function(rows) all(colSums(tmp[rows, c("a", "b")]) > 0))
})

#    id a b res 
# 1   1 0 1   0
# 2   1 0 1   0
# 3   2 1 0   1
# 4   2 0 1   1
# 5   3 1 0   1
# 6   3 0 1   1
# 7   3 1 0   1
# 8   4 1 0   0
# 9   4 1 0   0
# 10  5 1 0   0
# 11  5 1 0   0
# 12  6 1 0   1
# 13  6 0 1   1
# 14  7 0 1   0
# 15  7 0 1   0
# 16  8 0 1   0
# 17  8 0 1   0
# 18  8 0 1   0
# 19  9 0 1   1
# 20  9 1 0   1
# 21  9 0 1   1

替代方法:

library(plyr)
# One row per id: res is TRUE when a and b each occurred at least once.
ddply(.data = tmp, .variables = .(id), .fun = summarise,
      res = sum(a) > 0 & sum(b) > 0)
# Same flag, but kept alongside every original row of the id.
ddply(.data = tmp, .variables = .(id), .fun = transform,
      res = sum(a) > 0 & sum(b) > 0)

library(dplyr)
# One row per id: res is TRUE when a and b each occurred at least once.
tmp %>%
  group_by(id) %>%
  summarise(res = sum(a) > 0 & sum(b) > 0)
# Same flag, but added as a column next to every original row.
tmp %>%
  group_by(id) %>%
  mutate(res = sum(a) > 0 & sum(b) > 0)

答案 1 :(得分:3)

这是data.table解决方案

library(data.table)
# Convert df to a data.table by reference.
setDT(df)
# Per id, add occur = 1 when both a and b occurred at least once, else 0.
df[, occur := as.numeric(sum(a) > 0 & sum(b) > 0), by = id]
# Print the updated table.
df[]
#    id a b occur
# 1:  1 0 1     0
# 2:  1 0 1     0
# 3:  2 1 0     1
# 4:  2 0 1     1
# 5:  3 1 0     1
# 6:  3 0 1     1
# 7:  3 1 0     1
# ...
# ...

答案 2 :(得分:2)

首先聚合,然后识别ID:

# Count the a and b events within each id. Naming the `by` element makes
# the grouping column come out as "id" directly, so no renaming is needed.
aggr_mydf <- aggregate(mydf[, c("a", "b")], by = list(id = mydf$id), FUN = sum)
# Flag ids where both counts are positive (1) versus not (0).
# The columns are compared directly rather than via apply(): the earlier
# `all(x)>0` coerced the counts to logical (with a warning), compared the
# resulting TRUE/FALSE to 0, and also swept the id column into the test,
# so an id equal to 0 would always have been flagged 0.
aggr_mydf$both <- as.numeric(aggr_mydf$a > 0 & aggr_mydf$b > 0)

> aggr_mydf
       id a b both
1       1 0 2    0
2       2 1 1    1
3       3 2 1    1
4       4 2 0    0
5       5 2 0    0
6       6 1 1    1
7       7 0 2    0
8       8 0 3    0
9       9 1 2    1

# Attach each id's `both` flag to the original rows. Only `id` and `both`
# are taken from the aggregate, so merge() creates no a.x/a.y/b.x/b.y
# duplicates and there is nothing to drop by position or rename afterwards
# (positional dropping breaks as soon as mydf carries any extra column).
mydf <- merge(x = mydf, y = aggr_mydf[, c("id", "both")],
              by = "id", all.x = TRUE)

> mydf
   id   a   b both
1   1   0   1    0
2   1   0   1    0
3   2   1   0    1
4   2   0   1    1
5   3   1   0    1
6   3   0   1    1
7   3   1   0    1 
8   4   1   0    0
9   4   1   0    0
10  5   1   0    0
11  5   1   0    0
12  6   1   0    1
13  6   0   1    1
14  7   0   1    0
15  7   0   1    0
16  8   0   1    0
17  8   0   1    0
18  8   0   1    0
19  9   0   1    1
20  9   1   0    1
21  9   0   1    1