我想计算一批描述性统计量,需要按几百个分组变量分别进行分组。我从 How to group data.table by multiple columns? 了解到,如果想得到分组变量各种组合的统计量,可以在 by 参数中使用 list()。但在我的情况下,我想要的是 Y 的每个水平的平均值,然后是 Z 的每个水平的平均值(而不是 Y 与 Z 每种组合的平均值)。
# Load data.table up front; the example below builds a plain data.frame
# first and then converts it.
library(data.table)

# Example data: X is a running index, Y is a binary grouping column and
# Z is a six-level grouping column (seed fixed for reproducibility).
set.seed(7)
DF <- data.frame(
  X = 1:50000,
  Y = sample(c(0, 1), size = 50000, replace = TRUE),
  Z = sample(0:5, size = 50000, replace = TRUE)
)
DT <- data.table(DF)

# Attempt 1: grouping by both columns at once yields one mean per
# (Y, Z) combination, which is not what is wanted here.
DT[, mean(X), by = .(Y, Z)]

# Attempt 2: passing the column names as a character vector does the same.
DT[, mean(X), by = c("Y", "Z")]

# Attempt 3: aggregate once per grouping column and stack the results.
# This works ...
out <- do.call(
  rbind,
  lapply(c("Y", "Z"), function(grp_col) DT[, mean(X), by = get(grp_col)])
)
# ... but it is really slow.
我有 1 亿条记录和 400 多个分组变量,所以需要一种比较高效的方法。lapply 方案会额外增加好几天的处理时间。
# Print numbers with up to 15 significant digits so that small timing
# differences remain visible.
options(digits = 15)

# Time the lapply() approach: one grouped mean per grouping column.
start.time <- Sys.time()
out <- lapply(c("Y", "Z"), function(K) DT[, mean(X), by = get(K)])
end.time <- Sys.time()
time.taken <- difftime(end.time, start.time)

# Time the same two aggregations written out as separate calls.
start.time <- Sys.time()
DT[, mean(X), by = "Y"]
DT[, mean(X), by = "Z"]
end.time <- Sys.time()
time.taken2 <- difftime(end.time, start.time)

# Difference between the two elapsed times
# (a positive value means the lapply() version took longer).
time.taken - time.taken2
答案 0 :(得分:5)
在开发版本 1.10.5 中,data.table 新增了分组集(grouping sets)聚合函数,
可以在不同的分组层级上计算聚合,从而一次生成多个(小计)汇总结果。
library(data.table)
# data.table 1.10.5 IN DEVELOPMENT built 2018-01-31 02:23:45 UTC

# Every column except the value column X is treated as a grouping variable.
grp_vars <- setdiff(names(DF), "X")

# Convert DF to a data.table by reference, then aggregate X once per
# grouping variable: each element of `sets` is a single-column grouping set.
setDT(DF)
groupingsets(
  DF,
  j = mean(X),
  by = grp_vars,
  sets = as.list(grp_vars)
)
    Y  Z       V1
1:  1 NA 24960.98
2:  0 NA 25039.96
3: NA  5 24652.44
4: NA  0 25006.61
5: NA  2 25223.83
6: NA  3 24959.26
7: NA  1 25095.58
8: NA  4 25068.84
# Benchmark: compare groupingsets() with two lapply()-based approaches for
# computing per-variable group means on a larger synthetic data set.
# Requires data.table (already attached) and the microbenchmark package.

# create data
n_rows <- 1e6L
n_vars <- 5
n_grps <- 1e2L
set.seed(007)
DT <- data.table(rn = seq_len(n_rows))
# Value columns X1..X<n_vars>: normal draws scaled by the column index,
# added by reference with set().
for (i in seq_len(n_vars)) {
  set(DT, , paste0("X", i), i * rnorm(n_rows))
}
# Grouping columns Z1..Z<n_grps>: column Z<i> is sampled from 0:i.
for (i in seq_len(n_grps)) {
  set(DT, , paste0("Z", i), sample(0:i, n_rows, TRUE))
}
grps <- grep("^Z", names(DT), value = TRUE)
vars <- grep("^X", names(DT), value = TRUE)

# run benchmark
bm <- microbenchmark::microbenchmark(
  # One groupingsets() call with a single-column grouping set per variable.
  gs = {
    groupingsets(
      DT,
      lapply(.SD, mean),
      by = grps,
      sets = as.list(grps),
      .SDcols = vars
    )
  },
  # One aggregation per grouping column, grouped by column name and
  # combined with rbindlist(fill = TRUE).
  lapply1 = {
    rbindlist(
      lapply(
        grps,
        function(K) DT[, lapply(.SD, mean), by = K, .SDcols = vars]
      ),
      fill = TRUE
    )
  },
  # Same idea using by = get(K) and do.call(rbind, ...), mirroring the
  # approach shown in the question.
  lapply2 = {
    out <- lapply(
      grps,
      function(K) DT[, lapply(.SD, mean), by = get(K), .SDcols = vars]
    )
    do.call(rbind, out)
  },
  times = 3L
)
print(bm)
即使有 100 万行和 100 个分组变量,三种方法的运行时间也没有显著差异
(groupingsets() 似乎比其他两种方法略慢一点):
Unit: seconds
    expr      min       lq     mean   median       uq      max neval
      gs 3.602689 3.606646 3.608343 3.610603 3.611169 3.611735     3
 lapply1 3.524957 3.546060 3.561130 3.567163 3.579217 3.591270     3
 lapply2 3.562424 3.569284 3.577199 3.576144 3.584586 3.593027     3