我有一个面板数据框,其中 1 表示事件发生,0 表示事件未发生。如何识别出在其任意行中 a 和 b 都发生过的 ID(例如 ID 2 和 3)?而在 ID 1 和 5 中,只有 a 或 b 其中之一发生过,并非两者都发生。
示例数据以及我期望得到的结果(occur 列)如下所示:
# Example panel data for the question (looks like dput() output).
# 21 rows with four integer columns:
#   id    - unit identifier (1 to 9), repeated over 2 or 3 rows each
#   a, b  - 0/1 event indicators
#   occur - the desired result: 1 on every row of an id for which both a and b
#           equal 1 in at least one of its rows (ids 2, 3, 6 and 9), else 0
structure(list(id = c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L,
5L, 6L, 6L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L), a = c(0L, 0L, 1L,
0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L), b = c(1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L), occur = c(0L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L)), .Names = c("id", "a", "b", "occur"), class = "data.frame", row.names = c(NA,
-21L))
答案 0 :(得分:3)
以下是两种方式,两种方式都给出了相同的结果
# Example panel without the expected-result column: 21 rows, integer columns
# id (1 to 9), a and b (0/1 event indicators). Values are grouped one id per
# line for readability.
tmp <- data.frame(
  id = rep(1:9, times = c(2L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 3L)),
  a = c(
    0L, 0L,
    1L, 0L,
    1L, 0L, 1L,
    1L, 1L,
    1L, 1L,
    1L, 0L,
    0L, 0L,
    0L, 0L, 0L,
    0L, 1L, 0L
  ),
  b = c(
    1L, 1L,
    0L, 1L,
    0L, 1L, 0L,
    0L, 0L,
    0L, 0L,
    0L, 1L,
    1L, 1L,
    1L, 1L, 1L,
    1L, 0L, 1L
  )
)
针对每个唯一 ID 给出一个结果:
# One row per id: column 1 is the id, column 2 is 1 when both a and b occurred
# at least once within that id, otherwise 0.
# by() returns its results in sorted id order, so the ids are sorted here too;
# pairing the results with plain unique() (order of appearance) would mislabel
# rows whenever the ids in tmp are not already sorted.
# Columns are selected by name rather than by position so that extra columns
# in tmp cannot shift the selection.
cbind(
  sort(unique(tmp$id)),
  c(by(tmp, tmp$id, FUN = function(x) all(colSums(x[, c("a", "b")]) > 0)))
)
# [,1] [,2]
# 1 1 0
# 2 2 1
# 3 3 1
# 4 4 0
# 5 5 0
# 6 6 1
# 7 7 0
# 8 8 0
# 9 9 1
在数据中添加一列
# Add a column `res` that is 1 on every row of an id for which both a and b
# occurred at least once, and 0 otherwise.
# ave() is handed row indices so the per-id function can look at both columns
# at once; the logical result is recycled over the group's rows and stored as
# 0/1 because the index vector is integer.
# seq_len() is used instead of 1:nrow(tmp), which would yield c(1, 0) for an
# empty data frame. Columns are selected by name rather than by position.
within(tmp, {
  res <- ave(
    seq_len(nrow(tmp)), tmp$id,
    FUN = function(rows) all(colSums(tmp[rows, c("a", "b")]) > 0)
  )
})
# id a b res
# 1 1 0 1 0
# 2 1 0 1 0
# 3 2 1 0 1
# 4 2 0 1 1
# 5 3 1 0 1
# 6 3 0 1 1
# 7 3 1 0 1
# 8 4 1 0 0
# 9 4 1 0 0
# 10 5 1 0 0
# 11 5 1 0 0
# 12 6 1 0 1
# 13 6 0 1 1
# 14 7 0 1 0
# 15 7 0 1 0
# 16 8 0 1 0
# 17 8 0 1 0
# 18 8 0 1 0
# 19 9 0 1 1
# 20 9 1 0 1
# 21 9 0 1 1
替代方法:
# plyr: one row per id (summarise) or one flag per original row (transform).
# res is TRUE when both a and b sum to more than zero within the id.
library(plyr)
ddply(tmp, "id", summarise, res = sum(a) > 0 & sum(b) > 0)
ddply(tmp, "id", transform, res = sum(a) > 0 & sum(b) > 0)

# dplyr: the same two results, written as pipelines.
library(dplyr)
tmp %>%
  group_by(id) %>%
  summarise(res = sum(a) > 0 & sum(b) > 0)
tmp %>%
  group_by(id) %>%
  mutate(res = sum(a) > 0 & sum(b) > 0)
答案 1 :(得分:3)
这是一个 data.table 解决方案:
library(data.table)
# Turn df into a data.table, then add `occur` per id: 1 when both a and b sum
# to more than zero within the id, otherwise 0.
setDT(df)
df[, occur := as.numeric(sum(a) > 0 & sum(b) > 0), by = id]
# The trailing [] makes the updated table print.
df[]
# id a b occur
# 1: 1 0 1 0
# 2: 1 0 1 0
# 3: 2 1 0 1
# 4: 2 0 1 1
# 5: 3 1 0 1
# 6: 3 0 1 1
# 7: 3 1 0 1
# ...
# ...
答案 2 :(得分:2)
首先聚合,然后识别ID:
# Sum the a and b indicators within each id. Naming the grouping list element
# "id" gives the result the columns id, a, b directly.
aggr_mydf <- aggregate(mydf[, c("a", "b")], by = list(id = mydf$id), FUN = sum)
# Flag ids where both a and b occurred at least once (1) or not (0).
# The comparison is done on the a and b sums only. The earlier row-wise
# `all(x)>0` also fed the id column into the test and coerced the numbers to
# logical (with a warning) before comparing, instead of testing each value
# against zero.
aggr_mydf$both <- as.numeric(aggr_mydf$a > 0 & aggr_mydf$b > 0)
> aggr_mydf
id a b both
1 1 0 2 0
2 2 1 1 1
3 3 2 1 1
4 4 2 0 0
5 5 2 0 0
6 6 1 1 1
7 7 0 2 0
8 8 0 3 0
9 9 1 2 1
# Attach the per-id `both` flag to every row of mydf.
# Only the id and both columns of the aggregate are merged in, so no summed
# a/b columns need to be dropped by position and renamed afterwards; any
# other columns already present in mydf are left untouched.
mydf <- merge(
  x = mydf,
  y = aggr_mydf[, c("id", "both")],
  by = "id",
  all.x = TRUE
)
> mydf
id a b both
1 1 0 1 0
2 1 0 1 0
3 2 1 0 1
4 2 0 1 1
5 3 1 0 1
6 3 0 1 1
7 3 1 0 1
8 4 1 0 0
9 4 1 0 0
10 5 1 0 0
11 5 1 0 0
12 6 1 0 1
13 6 0 1 1
14 7 0 1 0
15 7 0 1 0
16 8 0 1 0
17 8 0 1 0
18 8 0 1 0
19 9 0 1 1
20 9 1 0 1
21 9 0 1 1