我的数据集如下所示:
# Example input: four blocks, two rows per block (one row per adult type).
block <- rep(c(1, 2, 3, 4), each = 2)
# The treatment alternates from one block to the next: 1, 2, 1, 2.
treatment <- rep(c(1, 2, 1, 2), each = 2)
type <- rep(c("adult1", "adult2"), times = 4)
# In this toy data every time point holds the same values as the block id.
t1 <- block
t2 <- block
t100 <- block
df <- data.frame(
  block = block,
  treatment = treatment,
  type = type,
  t1 = t1,
  t2 = t2,
  t100 = t100
)
我希望在每个时间点(t1、t2、t100)上把各 adult 行的值相加。我希望最终输出如下所示:
# Desired result: one row per block holding the sum of that block's two
# adult rows at every time point.
block <- c(1, 2, 3, 4)
treatment <- rep(c(1, 2), times = 2)
type <- rep("adult", times = 4)
# Two identical rows per block were added up, so each total is twice the
# block id: 2, 4, 6, 8.
t1 <- 2 * block
t2 <- 2 * block
t100 <- 2 * block
df <- data.frame(
  block = block,
  treatment = treatment,
  type = type,
  t1 = t1,
  t2 = t2,
  t100 = t100
)
以下是我使用聚合函数的尝试:
# Attempt from the question; it is reported to fail with the error quoted
# below. Columns 3:5 of df are type, t1 and t2, so the character column
# `type` is handed to sum() and t100 is left out of the selection.
aggregate(df[,3:5], by = list(df$block), FUN = sum)
我收到一条错误消息,指出“参数长度必须相同”。
答案 0 :(得分:2)
使用 `aggregate` 时,您可以用公式接口对 t1 到 t100 各列求和,并按 `block` 和 `treatment` 进行分组:
# Sum the three time-point columns within every block/treatment combination.
# cbind() on the left-hand side of the formula aggregates several columns in
# a single call.
df_final <- aggregate(
  cbind(t1, t2, t100) ~ block + treatment,
  data = df,
  FUN = sum
)
# All summed rows are adults, so label them with a constant column.
df_final[["type1"]] <- "adult"
**结果:**
block treatment t1 t2 t100 type1
1 1 1 2 2 2 adult
2 3 1 6 6 6 adult
3 2 2 4 4 4 adult
4 4 2 8 8 8 adult
或者您可以使用 `dplyr`:
library(dplyr)
# Group by block and treatment, sum the columns t1 through t100 (selected by
# name range), then tag every summed row as "adult".
df %>%
  group_by(block, treatment) %>%
  summarise_at(.vars = vars(t1:t100), .funs = sum) %>%
  mutate(type1 = "adult")
或
# Same pipeline as the name-based variant, but selecting the columns to sum
# by position instead of by name.
# NOTE(review): the printed result shows t1, t2 and t100 being summed, so the
# positions 2:4 appear to be counted among the non-grouping columns
# (type, t1, t2, t100) rather than among all columns of df; confirm against
# the installed dplyr version.
df %>%
group_by(block, treatment) %>%
summarize_at(vars(2:4), sum) %>%
mutate(type1 = "adult")
**结果:**
# A tibble: 4 x 6
# Groups: block [4]
block treatment t1 t2 t100 type1
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 1 2 2 2 adult
2 2 2 4 4 4 adult
3 3 1 6 6 6 adult
4 4 2 8 8 8 adult
您还可以使用支持按列索引选择列的 `data.table`:
library(data.table)
# Convert df to a data.table first; setDT() changes the object in place
# rather than returning a copy.
setDT(df)
# Sum columns 4:6 of df (t1, t2, t100) within each block/treatment group.
df[
  ,
  lapply(.SD, sum),
  by = .(block, treatment),
  .SDcols = 4:6
]
**结果:**
block treatment t1 t2 t100
1: 1 1 2 2 2
2: 2 2 4 4 4
3: 3 1 6 6 6
4: 4 2 8 8 8
答案 1 :(得分:1)
基础R中的解决方案:
# Sum t1, t2 and t100 per block/treatment combination, then append a
# constant `type` column. Note that this overwrites the original df.
df <- cbind(
  aggregate(
    cbind(t1, t2, t100) ~ block + treatment,
    data = df,
    FUN = sum
  ),
  type = "adult"
)
# block treatment t1 t2 t100 type
#1 1 1 2 2 2 adult
#2 3 1 6 6 6 adult
#3 2 2 4 4 4 adult
#4 4 2 8 8 8 adult
注意:关键在于用 `cbind` 把需要汇总的相关列组合起来。
以下是三种解决方案(基础 R、`dplyr`、`data.table`)的 `microbenchmark` 基准测试结果。
# Sample data frame with 1000 blocks and two rows per block (2000 rows).
# Every column is built at the full row count, so nothing relies on
# data.frame() silently recycling shorter vectors. Recycling a 250-element
# treatment vector would break the 1, 1, 2, 2 pattern at each 250-row
# boundary.
df <- local({
  n_blocks <- 1000L
  rows_per_block <- 2L
  n_rows <- n_blocks * rows_per_block
  block_id <- rep(seq_len(n_blocks), each = rows_per_block)
  data.frame(
    block = block_id,
    # Treatment alternates between consecutive blocks: 1, 2, 1, 2, ...
    treatment = rep(c(1, 1, 2, 2), length.out = n_rows),
    # The two adult types alternate within every block.
    type = rep(c("adult1", "adult2"), length.out = n_rows),
    # As in the small example, each time point repeats the block id.
    t1 = block_id,
    t2 = block_id,
    t100 = block_id
  )
})
# Benchmark the three approaches on the sample data frame.
# library() is used instead of require() so that a missing package stops the
# script here instead of returning FALSE and failing later.
library(microbenchmark)
library(dplyr)
library(magrittr)
library(data.table)

# Convert once, outside the timed expressions. Calling setDT(df) inside the
# benchmark turns `df` itself into a data.table by reference, so the base R
# and dplyr expressions would be timed on a data.table after the first
# data.table run. The timings listed below were recorded with the original
# in-place setDT() call.
dt <- as.data.table(df)

microbenchmark(
  baseR = cbind.data.frame(
    aggregate(cbind(t1, t2, t100) ~ block + treatment, data = df, FUN = sum),
    type = "adult"
  ),
  dplyr = df %>%
    group_by(block, treatment) %>%
    summarize_at(vars(t1:t100), sum) %>%
    mutate(type1 = "adult"),
  datatable = dt[, lapply(.SD, sum), by = .(block, treatment), .SDcols = 4:6]
)
#Unit: microseconds
# expr min lq mean median uq max neval
# baseR 13817.627 14040.4835 14931.4202 14278.8220 15026.413 42347.511 100
# dplyr 6698.983 7076.6360 8459.7861 7240.1680 7486.245 73401.747 100
# datatable 463.837 500.6555 663.5425 576.3075 597.443 9015.664 100