我有data.frame
的列表。例如
set.seed(1)
my_list <- list()
ids = c("a","b","c","d","e")
for(i in 1:5){
my_list[[i]] <- data.frame(id = ids, p = rnorm(length(ids)), m = rnorm(length(ids)), hp = runif(length(ids)), hm = runif(length(ids)), d = rnorm(length(ids)), a = rnorm(length(ids)))
}
我想要的是有效地为每个id(在“id”列中)计算列表中所有数据帧上的“p”,“m”,“d”和“a”列的方差。理想情况下,这将返回data.frame
这样的(基于上面绘制的值):
> result.df
id var_p var_m var_d var_a
1 a 0.2371569 1.7810729 0.08264279 0.5074250
2 b 0.1091675 0.2107997 1.15051229 1.1578691
3 c 0.5385789 0.7650123 0.44215343 0.3137903
4 d 1.0174542 0.7818498 0.06414317 0.6079849
5 e 0.7343667 1.2870542 1.41615858 0.7362462
答案 0 :(得分:3)
使用my_list
library(plyr)
df = do.call(rbind, my_list)
out = ddply(df, .(id), colwise(var, c('p','m','d','a')))
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
或使用lapply
和apply
df = do.call(rbind, my_list)
df1 = do.call(rbind,
lapply(split(df, df$id),
function(x) apply(subset(x, select = c(p,m,d,a)), 2, var)))
out = transform(df1, id = row.names(df1))
#> out
# p m d a id
#a 0.2371569 1.7810729 0.08264279 0.5074250 a
#b 0.1091675 0.2107997 1.15051229 1.1578691 b
#c 0.5385789 0.7650123 0.44215343 0.3137903 c
#d 1.0174542 0.7818498 0.06414317 0.6079849 d
#e 0.7343667 1.2870542 1.41615858 0.7362462 e
或使用doBy
library(doBy)
df = do.call(rbind, my_list)
out = summaryBy( p + m + d + a ~ id , data = df, keep.names=TRUE, FUN = var)
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
或使用sqldf
library(sqldf)
df = do.call(rbind, my_list)
out = sqldf("select id, variance(p), variance(m),
variance(d), variance(a) from df group by id")
#> out
# id variance(p) variance(m) variance(d) variance(a)
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
答案 1 :(得分:3)
这是基础R方法
dat <- do.call(rbind,my_list)
aggregate( cbind(p,m,d,a) ~ id, var, data=dat)
给出了
id p m d a
1 a 0.2371569 1.7810729 0.08264279 0.5074250
2 b 0.1091675 0.2107997 1.15051229 1.1578691
3 c 0.5385789 0.7650123 0.44215343 0.3137903
4 d 1.0174542 0.7818498 0.06414317 0.6079849
5 e 0.7343667 1.2870542 1.41615858 0.7362462
答案 2 :(得分:3)
library(data.table)
rbindlist(my_list)[, lapply(.SD, var), by = id, .SDcols = c("p","m","d","a")]
# id p m d a
# 1: a 0.2371569 1.7810729 0.08264279 0.5074250
# 2: b 0.1091675 0.2107997 1.15051229 1.1578691
# 3: c 0.5385789 0.7650123 0.44215343 0.3137903
# 4: d 1.0174542 0.7818498 0.06414317 0.6079849
# 5: e 0.7343667 1.2870542 1.41615858 0.7362462
答案 3 :(得分:2)
已更新以使用bind_rows()
(根据@hadley建议,效率高于do.call(rbind,...)
)
library(dplyr)
dat <- bind_rows(dat)[,c("id","p","m","d","a")]
dat %>% group_by(id) %>% summarise_each(funs(var))
# id p m d a
# 1 a 0.2371569 1.7810729 0.08264279 0.5074250
# 2 b 0.1091675 0.2107997 1.15051229 1.1578691
# 3 c 0.5385789 0.7650123 0.44215343 0.3137903
# 4 d 1.0174542 0.7818498 0.06414317 0.6079849
# 5 e 0.7343667 1.2870542 1.41615858 0.7362462