我在对df的行求和时遇到了一些麻烦:我想在每个group内部,把clas为BAD和UGLY的行的n加总起来(如果这些行存在的话)!
我试过这个,但当然因为我的逻辑不好而无法正常工作!
# Example data: `group` is a numeric grouping key, `clas` a label
# (BAD / GOOD / UGLY) and `n` the value to be summed.
# Groups 1.0, 1.2 and 1.4 each hold a single BAD row, groups 1.6 and 1.8
# hold all three labels, and group 2.0 holds only BAD and GOOD.
group <- c(
  seq(from = 1, to = 1.4, by = 0.2),
  rep(seq(from = 1.6, to = 2, by = 0.2), times = c(3, 3, 2))
)
clas <- c(
  rep("BAD", times = 3),
  rep(c("BAD", "GOOD", "UGLY"), times = 2),
  "BAD", "GOOD"
)
n <- c(rep(1000, times = 3), 96, 180, 715, 190, 184, 26, 124, 874)
df <- data.frame(group = group, clas = clas, n = n)
> df
# group clas n
#1 1.0 BAD 1000
#2 1.2 BAD 1000
#3 1.4 BAD 1000
#4 1.6 BAD 96
#5 1.6 GOOD 180
#6 1.6 UGLY 715
#7 1.8 BAD 190
#8 1.8 GOOD 184
#9 1.8 UGLY 26
#10 2.0 BAD 124
#11 2.0 GOOD 874
错误:无效的'envir'参数,类型为'logical'
如果代码可以修复,预期输出
# Original (non-working) attempt: for each group, add a column holding the
# total of n over the BAD and UGLY rows. Problems in the mutate() call:
#   * with() receives a logical vector as its data argument and no expression
#     to evaluate, which raises "invalid 'envir' argument of type 'logical'".
#   * clas == "BAD" & clas == "UGLY" is tested element-wise, so it can never
#     be TRUE: a single row cannot carry both labels.
#   * n["BAD"] and n["UGLY"] index n by element name; n is built as an
#     unnamed numeric vector, so both lookups give NA.
#   * the fallback "NA" is a character string, not the missing value NA.
library(dplyr)
df %>%
group_by(group) %>%
mutate(sum = ifelse(all(clas=="BAD"),n,ifelse(with(clas=="BAD"&clas=="UGLY"),n["BAD"]+n["UGLY"],"NA")))
谢谢!
答案 0 :(得分:2)
另一种选择是使用base R的aggregate和merge。
我们可以先按group对n做aggregate(忽略clas为GOOD的行),然后按group将结果merge回原始数据框,以得到预期的行数。
# Total n per group over the rows whose clas is not "GOOD", then join those
# totals back onto every row of df by group. Both inputs carry an `n`
# column, so the suffixes keep df's column as `n` and name the total `nSum`.
merge(
  x = df,
  y = aggregate(n ~ group, data = df[df$clas != "GOOD", ], FUN = sum),
  by = "group",
  suffixes = c("", "Sum")
)
# group clas n nSum
#1 1.0 BAD 1000 1000
#2 1.2 BAD 1000 1000
#3 1.4 BAD 1000 1000
#4 1.6 BAD 96 811
#5 1.6 GOOD 180 811
#6 1.6 UGLY 715 811
#7 1.8 BAD 190 216
#8 1.8 GOOD 184 216
#9 1.8 UGLY 26 216
#10 2.0 BAD 124 124
#11 2.0 GOOD 874 124
答案 1 :(得分:1)
我们可以使用data.table。将'data.frame'转换为'data.table'(setDT(df)),按'group'分组,使用%in%(代码中用的是data.table针对字符向量的快速版本%chin%)在'clas'中得到匹配'BAD'、'UGLY'元素的逻辑索引,用此索引对'n'取子集,求和(sum),并将结果赋值(:=)给新列'Sum'。
在1e6数据集上,基准是
library(data.table)
# Convert df to a data.table in place, then add (by reference, via :=) a
# per-group column `Sum` holding the total of n over the rows whose clas is
# "BAD" or "UGLY"; every row of a group receives the same total.
setDT(df)
df[,
  Sum := sum(n[as.character(clas) %chin% c("BAD", "UGLY")]),
  by = group
]
# Print the updated table.
df
# group clas n Sum
#1: 1.0 BAD 1000 1000
#2: 1.2 BAD 1000 1000
#3: 1.4 BAD 1000 1000
#4: 1.6 BAD 96 811
#5: 1.6 GOOD 180 811
#6: 1.6 UGLY 715 811
#7: 1.8 BAD 190 216
#8: 1.8 GOOD 184 216
#9: 1.8 UGLY 26 216
#10: 2.0 BAD 124 124
#11: 2.0 GOOD 874 124
如果我们将行数从1e6增加到1e7
# Benchmark data: one million rows with an integer group key drawn from
# 1..1000, a random BAD / GOOD / UGLY label and a random n in 100..1000.
set.seed(24)
df1 <- data.frame(
  group = sample(1:1000, size = 1e6, replace = TRUE),
  clas = sample(c("BAD", "GOOD", "UGLY"), size = 1e6, replace = TRUE),
  n = sample(100:1000, size = 1e6, replace = TRUE)
)
# Keep a separate copy for the base R run, since setDT() below converts df1
# in place. copy() is presumably data.table's, attached earlier in the file.
df2 <- copy(df1)

# data.table: grouped sum assigned by reference.
system.time(
  setDT(df1)[,
    Sum := sum(n[as.character(clas) %chin% c("BAD", "UGLY")]),
    by = group
  ]
)
# user system elapsed
# 0.04 0.02 0.06

# base R: aggregate the non-GOOD rows, then merge the totals back by group.
system.time(
  merge(
    x = df2,
    y = aggregate(n ~ group, data = df2[df2$clas != "GOOD", ], FUN = sum),
    by = "group",
    suffixes = c("", "Sum")
  )
)
# user system elapsed
# 5.00 0.16 5.17