我之前问过这个问题,不过当时针对的是 Excel 中的数组公式。但是,我的数据集太大,导致电子表格失去响应,因此我希望改在 R 中运行,以提高效率。
Link以前回答的问题
我会在此重申并扩展它。
Day Type Val
1 A 5
1 B 6
1 C 9
1 D 7
2 B 2
2 A 8
2 C 3
2 D 3
3 C 4
3 B 2
3 A 2
4 A 5
4 B 9
4 C 8
5 A 7
5 B 5
6 A 6
6 B 3
6 C 4
7 A 7
7 B 9
我在上面提供了一个简化的数据框(我的实际数据有数万行,包含数百个“Type”)。我希望能够指定一个“Type”,然后:
计算在指定的“Type”不存在的那些天里,其余各“Type”的“Val”平均值
计算在指定的“Type”存在的那些天里,其余各“Type”的“Val”平均值
例如,如果我指定'Type'C,那么在第5天和第7天就不存在C.
输出将是:
Type Average_Without Average_With
A 7.0 5.2
B 7.0 4.4
C NA 5.6
D NA 5.0
答案 0 :(得分:9)
以下是使用data.table
包获取所需值的方法(但@DavidArenburg解决方案更好,并且会产生您想要的输出):
library(data.table)
# Turn df into a data.table once, so the bracket calls below use
# data.table's i / j / by semantics.
setDT(df)

# Average_with: mean Val per Type, using only the days on which
# Type "C" occurs at least once.
df[Day %in% unique(Day[Type == "C"]), .(V1 = mean(Val)), by = "Type"]
# Type V1
#1: A 5.2
#2: B 4.4
#3: C 5.6
#4: D 5.0

# Average_without: mean Val per Type, using only the days on which
# Type "C" never occurs. Types with no rows on such days are absent.
df[!(Day %in% unique(Day[Type == "C"])), .(V1 = mean(Val)), by = "Type"]
# Type V1
#1: A 7
#2: B 7
修改,以获得您的输出:
# Add a logical column C (by reference): TRUE for every row whose Day
# has at least one Type "C" observation.
setDT(df)
df[, C := Day %in% unique(Day[Type == "C"])]

# Mean Val for each (Type, C) combination, in long format.
res <- df[, .(V1 = mean(Val)), by = c("Type", "C")]

# Reshape to wide: one row per Type, one column per value of C.
dcast(res, Type ~ C, value.var = "V1")
# Type FALSE TRUE
#1: A 7 5.2
#2: B 7 4.4
#3: C NA 5.6
#4: D NA 5.0
或更直接,感谢@Frank:
# Add a logical column C (by reference): TRUE for every row whose Day
# has at least one Type "C" observation.
setDT(df)[, C := Day %in% df[Type == "C", unique(Day)]]

# Cast directly from the raw rows, averaging Val within each (Type, C) cell.
# The aggregation argument is spelled out in full (`fun.aggregate`) rather
# than relying on partial matching of `fun`.
dcast(df, Type ~ C, value.var = "Val", fun.aggregate = mean)
一些基准
# Fix the RNG seed so the simulated benchmark data are reproducible.
set.seed(123)
# Simulated data: 50,000 rows. Day is sampled from 1..1000, Type is a
# two-letter code built from the letters a..o, and Val is standard normal.
# DF and df are both bound to the same data.frame.
DF <- df <- data.frame(Day=sample(1:1000, 50000, replace=TRUE),
Type=paste0(sample(letters[1:15], 50000, replace=TRUE), sample(letters[1:15], 50000, replace=TRUE)),
Val=rnorm(50000), stringsAsFactors=FALSE)
library(microbenchmark)
# the test is made on "mo" (205 days with, 795 days without)
# NOTE(review): cath1 ... GG4 are not defined in this excerpt; presumably each
# wraps one of the solutions shown in the answers -- confirm before running.
microbenchmark(cath1(df), cath2(df), cath_Frank(df), david1(df), david2(df), GG1(DF), GG2(DF), GG3(DF), GG4(DF), unit="relative")
# expr min lq mean median uq max neval cld
# cath1(df) 1.3533329 1.4811559 1.4137217 1.502278 1.4146092 1.0523168 100 a
# cath2(df) 1.0000000 1.0000000 1.0000000 1.000000 1.0000000 1.0000000 100 a
# cath_Frank(df) 1.2985873 1.2980500 1.2380200 1.312180 1.2882213 0.9784906 100 a
# david1(df) 0.8642457 0.8717645 0.9768656 1.012679 0.9367868 0.9864712 100 a
# david2(df) 1.1708477 1.1723941 1.2105785 1.307281 1.2139049 0.9624526 100 a
# GG1(DF) 13.8436430 13.7552020 13.0925864 13.727017 15.9302047 3.0718886 100 d
# GG2(DF) 4.8765172 4.8827384 4.9342907 4.943654 4.5173281 1.6818194 100 b
# GG3(DF) 8.8005347 8.8393882 9.6084771 9.048975 11.9310902 4.1580238 100 c
# GG4(DF) 4.4787631 4.5812781 4.5098152 4.623952 4.2268167 1.5829500 100 b
答案 1 :(得分:9)
使用data.table
您也可以尝试
library(data.table)
setDT(df)

# Distinct days on which Type "C" occurs (unique() is probably optional,
# since %in% only tests membership).
indx <- df[Type == "C", unique(Day)]

# Per Type: mean Val on days without "C" and mean Val on days with "C".
# A Type with no rows in one of the two subsets yields NaN for that column.
df[,
   list(
     Without = mean(Val[!(Day %in% indx)]),
     With = mean(Val[Day %in% indx])
   ),
   by = "Type"]
# Type Without With
# 1: A 7 5.2
# 2: B 7 4.4
# 3: C NaN 5.6
# 4: D NaN 5.0
我们可以通过减少重复计算来进一步优化它,但代价是代码的可读性会变差
setDT(df)

# Distinct days on which Type "C" occurs.
indx <- df[Type == "C", unique(Day)]

# Same result as computing both masks separately, but the membership test
# is evaluated only once per Type group and then reused.
df[, {
  on_c_day <- Day %in% indx
  list(
    Without = mean(Val[!on_c_day]),
    With = mean(Val[on_c_day])
  )
}, by = "Type"]
# Type Without With
# 1: A 7 5.2
# 2: B 7 4.4
# 3: C NaN 5.6
# 4: D NaN 5.0
答案 2 :(得分:7)
我的理解是,所需要的是:对于其行中含有 C 的那些日子,按 Type 计算 Val 的平均值;以及对于其行中不含 C 的那些日子,按 Type 计算 Val 的平均值。
前三个解决方案不使用任何包。
1)aggregate 此方法对每个行子集分别使用 aggregate。我们首先创建 hasC,它是一个逻辑向量,DF 的每一行对应一个元素,表示该行所属的那一天是否含有 C。
# hasC[i] is TRUE when the Day of row i contains at least one Type "C" row.
hasC <- with(DF, ave(Type == "C", Day, FUN = any))

# Average Val per Type separately on the "without C" and "with C" rows, then
# full-join the two tables on their first column (Type) so that Types missing
# from one side get NA, and finally give the columns descriptive names.
m <- setNames(
  merge(
    aggregate(Val ~ Type, data = DF[!hasC, ], FUN = mean),
    aggregate(Val ~ Type, data = DF[hasC, ], FUN = mean),
    by = 1, all = TRUE
  ),
  c("Type", "Avg_wo", "Avg_with")
)
给出这个data.frame:
> m
Type Avg_wo Avg_with
1 A 7 5.2
2 B 7 4.4
3 C NA 5.6
4 D NA 5.0
2)tapply hasCf 与 hasC 类似,但已转换为因子(factor):
# hasC[i] is TRUE when the Day of row i contains at least one Type "C" row.
hasC <- with(DF, ave(Type == "C", Day, FUN = any))

# Same information as a two-level factor, so the result columns get
# readable labels and both levels are always present.
hasCf <- factor(
  hasC,
  levels = c(FALSE, TRUE),
  labels = c("Avg_without", "Avg_with")
)

# Mean of Val in every Type x hasCf cell; empty cells come back as NA.
with(DF, tapply(Val, list(Type, hasCf), mean))
给出这个矩阵,其中行名称是类型:
Avg_without Avg_with
A 7 5.2
B 7 4.4
C NA 5.6
D NA 5.0
3)xtabs hasCf
与(2)中的相同。第一个xtabs
计算总和,第二个计算计数。比率是平均值:
# hasC[i] is TRUE when the Day of row i contains at least one Type "C" row.
hasC <- with(DF, ave(Type == "C", Day, FUN = any))

# Two-level factor version of hasC with readable labels.
hasCf <- factor(
  hasC,
  levels = c(FALSE, TRUE),
  labels = c("Avg_without", "Avg_with")
)

# Cell sums of Val divided by cell counts give the cell means.
xtabs(Val ~ Type + hasCf, data = DF) / xtabs(~ Type + hasCf, data = DF)
给出这个c("xtabs", "table")
类对象:
hasCf
Type Avg_without Avg_with
A 7.0 5.2
B 7.0 4.4
C 5.6
D 5.0
4)dplyr 以下是使用dplyr和tidyr软件包的解决方案:
library(dplyr)
library(tidyr)

# 1. Within each Day, flag whether Type "C" is present (as a labelled factor).
# 2. Regroup by Type and that flag and average Val.
# 3. Spread the flag into columns: one row per Type.
# NOTE: spread() is superseded by tidyr::pivot_wider() in current tidyr; it is
# kept here so the snippet still runs on the older tidyr the output was
# produced with.
DF %>%
  group_by(Day) %>%
  mutate(
    hasC = factor(
      "C" %in% Type,
      levels = c(FALSE, TRUE),
      labels = c("Avg_without", "Avg_with")
    )
  ) %>%
  # group_by() replaces the previous grouping by Day.
  group_by(Type, hasC) %>%
  summarise(mean = mean(Val)) %>%
  ungroup() %>%
  spread(key = hasC, value = mean)
,并提供:
Source: local data frame [4 x 3]
Type Avg_without Avg_with
(fctr) (dbl) (dbl)
1 A 7 5.2
2 B 7 4.4
3 C NA 5.6
4 D NA 5.0
注意:可重现形式的输入 DF 如下:
# Example input: 21 observations over 7 days.
#   Day  - integer day index (days 1-2 have 4 rows, 3, 4 and 6 have 3,
#          5 and 7 have 2)
#   Type - factor with levels A, B, C, D
#   Val  - integer measurement
DF <- data.frame(
  Day = rep(1:7, times = c(4L, 4L, 3L, 3L, 2L, 3L, 2L)),
  Type = factor(
    c("A", "B", "C", "D",
      "B", "A", "C", "D",
      "C", "B", "A",
      "A", "B", "C",
      "A", "B",
      "A", "B", "C",
      "A", "B"),
    levels = c("A", "B", "C", "D")
  ),
  Val = c(5L, 6L, 9L, 7L,
          2L, 8L, 3L, 3L,
          4L, 2L, 2L,
          5L, 9L, 8L,
          7L, 5L,
          6L, 3L, 4L,
          7L, 9L)
)