假设我有如下 data.frame:
# Example data: 20 rows with a numeric amount (AE), a factor (FW), a character
# grouping key (CP) and a +1/-1 multiplier (SD).
DF <- data.frame(
  AE = c(
    148, 1789, 1223, 260, 1825, 37, 1442, 484, 10, 163,
    1834, 254, 445, 837, 721, 1904, 1261, 382, 139, 213
  ),
  # Level order is fixed explicitly so the integer codes match the original dput.
  FW = factor(
    c(
      "LYLR", "BIYX", "OCXG", "BIYX", "BIYX", "LYLR", "OCXG", "BIYX", "OCXG", "OCXG",
      "BIYX", "OCXG", "BIYX", "OCXG", "LYLR", "BIYX", "LYLR", "LYLR", "LYLR", "BIYX"
    ),
    levels = c("LYLR", "OCXG", "BIYX")
  ),
  CP = c(
    "WYB/NXO", "HUK/NXO", "HUK/WYB", "HUK/NXO", "WYB/NXO",
    "HUK/WYB", "HUK/NXO", "HUK/NXO", "WYB/NXO", "HUK/NXO",
    "WYB/NXO", "HUK/NXO", "HUK/WYB", "WYB/NXO", "HUK/WYB",
    "WYB/NXO", "WYB/NXO", "HUK/WYB", "WYB/NXO", "WYB/NXO"
  ),
  SD = c(1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1),
  # Keep CP as a character column regardless of the R version's default.
  stringsAsFactors = FALSE
)
或以人类可读的格式:
AE FW CP SD
1 148 LYLR WYB/NXO 1
2 1789 BIYX HUK/NXO 1
3 1223 OCXG HUK/WYB -1
4 260 BIYX HUK/NXO 1
5 1825 BIYX WYB/NXO 1
6 37 LYLR HUK/WYB 1
7 1442 OCXG HUK/NXO 1
8 484 BIYX HUK/NXO -1
9 10 OCXG WYB/NXO 1
10 163 OCXG HUK/NXO 1
11 1834 BIYX WYB/NXO -1
12 254 OCXG HUK/NXO -1
13 445 BIYX HUK/WYB 1
14 837 OCXG WYB/NXO -1
15 721 LYLR HUK/WYB -1
16 1904 BIYX WYB/NXO 1
17 1261 LYLR WYB/NXO -1
18 382 LYLR HUK/WYB 1
19 139 LYLR WYB/NXO 1
20 213 BIYX WYB/NXO 1
将其转换为 data.table:
# data.table must be attached before as.data.table()/setkey() can be called.
library(data.table)

# Convert the data.frame to a data.table (a copy; DF itself is left untouched).
DT <- as.data.table(DF)
# Sort DT by CP and mark CP as its key.
setkey(DT, CP)
现在考虑以下两个操作:
# Step 1: add the per-CP mean of AE * SD to every row (note: despite the
# "_sum_" in the column name, the value computed is a mean).
# The inner DT[...] joins DT to itself on CP, using the CP column as the lookup
# values; by=.EACHI evaluates the expression once per lookup row, and $V1
# extracts the auto-named result column.
DT[, amount_sum_fh := DT[.(CP = CP),
on = .(CP), mean(AE * SD), by=.EACHI]$V1]
# Step 2: same self-join pattern, this time for the per-CP mean of AE alone.
DT[, amount_sum_sh := DT[.(CP = CP),
on = .(CP), mean(AE), by=.EACHI]$V1]
有没有办法一举完成?
答案 0 :(得分:2)
到目前为止,问题、评论和答案中一共提出了三种方法(即下面基准测试中的 OP、Frank 和 DF 三项)。
要确定其中哪一种最快,可以使用 microbenchmark 包:
library(data.table)

# Keyed data.table built from DF; used by the two data.table candidates.
DT <- data.table(DF)
setkey(DT, CP)

# Time the three candidates, 100 runs each. The argument names (OP, Frank, DF)
# become the labels in the printed result.
mb <- microbenchmark::microbenchmark(
  # Two separate self-joins, as posted in the question.
  OP = {
    DT[, amount_sum_fh := DT[.(CP = CP),
                             on = .(CP), mean(AE * SD), by = .EACHI]$V1]
    DT[, amount_sum_sh := DT[.(CP = CP),
                             on = .(CP), mean(AE), by = .EACHI]$V1]
  },
  # One grouped assignment that creates both columns at once.
  Frank = DT[, `:=`(amount_sum_fh = mean(AE * SD), amount_sum_sh = mean(AE)), by = CP],
  # Base R on the plain data.frame: group means via ave().
  DF = transform(
    DF,
    amount_sum_fh = ave(AE * SD, CP, FUN = mean),
    amount_sum_sh = ave(AE, CP, FUN = mean)
  ),
  times = 100L
)
mb
#Unit: microseconds
# expr min lq mean median uq max neval cld
# OP 4090.271 4288.2800 4614.9625 4417.2700 4633.7880 7470.179 100 c
# Frank 548.833 612.9355 687.6306 643.5160 711.5745 1142.041 100 a
# DF 725.649 769.8660 840.5960 811.9315 870.3365 1376.425 100 b
即使样本量相当小,Frank 的 data.table 版本也比基础 R 版本快 25% 左右。
答案 1 :(得分:1)
下面是使用基础 R(base R)的一种方法:
# Start from a copy of DF and append the two per-CP group means as new columns.
# ave() returns a vector as long as its input, with each element replaced by
# the FUN result for that element's group, so row order is left unchanged.
DF2 <- DF
DF2$amount_sum_fh <- ave(DF$AE * DF$SD, DF$CP, FUN = mean)
DF2$amount_sum_sh <- ave(DF$AE, DF$CP, FUN = mean)
但请注意,得到的 data.frame 并未按 CP 排序,而是保持原来的行顺序。
> DF2
AE FW CP SD amount_sum_fh amount_sum_sh
1 148 LYLR WYB/NXO 1 34.11111 907.8889
2 1789 BIYX HUK/NXO 1 486.00000 732.0000
3 1223 OCXG HUK/WYB -1 -216.00000 561.6000
4 260 BIYX HUK/NXO 1 486.00000 732.0000
5 1825 BIYX WYB/NXO 1 34.11111 907.8889
6 37 LYLR HUK/WYB 1 -216.00000 561.6000
7 1442 OCXG HUK/NXO 1 486.00000 732.0000
8 484 BIYX HUK/NXO -1 486.00000 732.0000
9 10 OCXG WYB/NXO 1 34.11111 907.8889
10 163 OCXG HUK/NXO 1 486.00000 732.0000
11 1834 BIYX WYB/NXO -1 34.11111 907.8889
12 254 OCXG HUK/NXO -1 486.00000 732.0000
13 445 BIYX HUK/WYB 1 -216.00000 561.6000
14 837 OCXG WYB/NXO -1 34.11111 907.8889
15 721 LYLR HUK/WYB -1 -216.00000 561.6000
16 1904 BIYX WYB/NXO 1 34.11111 907.8889
17 1261 LYLR WYB/NXO -1 34.11111 907.8889
18 382 LYLR HUK/WYB 1 -216.00000 561.6000
19 139 LYLR WYB/NXO 1 34.11111 907.8889
20 213 BIYX WYB/NXO 1 34.11111 907.8889