使用aggregate.data.frame函数汇总r data.frame中的数据

时间:2018-01-30 14:26:02

标签: r aggregate

我在r中有一个数据框,如下所示:

# Build a 9 x 9 data frame of standard-normal draws and label its columns
# A_1, A_2, A_3, B_1, ..., C_3 (three letter groups, three replicates each).
df <- data.frame(matrix(rnorm(81), nrow = 9, ncol = 9))
names(df) <- paste0(
  rep(c("A", "B", "C"), each = 3), "_", rep(1:3, times = 3)
)

> df

         A_1         A_2        A_3        B_1        B_2        B_3        C_1        C_2         C_3
1 -1.4817028 -0.93474622  0.9294563  0.5773591  1.6658266 -2.2085816  0.7558782  0.1081266 -0.06831281
2 -1.3450369  1.34397438  1.3131486 -1.0451836  0.7909890 -0.4384893  0.1592584 -0.5894082 -1.80112265
3 -0.1212665 -1.74758183  1.1508606 -1.4708514  0.5012202  2.2396080 -1.7806906 -0.5606705  0.47215608
4  1.5809814 -0.93809925 -1.4442092  1.2810774  0.8366406  1.3915957 -1.0293784 -0.2156407  0.28290155
5  0.1782249 -0.30044139 -0.4122052 -0.9317817 -0.2969675 -0.8981296 -1.1330188  0.7853961  2.36157898
6 -1.4715701  0.31381770 -0.5048414  0.2679510 -1.1814695  0.9175016 -1.3791885 -1.0530426  0.30800623
7  0.3365095  0.69957024 -1.1665948 -1.4119800  0.4551571 -0.6020066 -0.8411100 -0.6023301 -0.19985798
8  1.0209572 -0.95542517 -1.7121831  0.2743081 -0.3398493 -0.6939674 -0.6430090  0.2723998  0.29528847
9 -0.3743835 -0.02206172  0.1315961 -0.2842016  1.3371216 -0.7539037  1.3974252  1.2905418 -1.74462796

我试图通过计算所有A,B和C的平均值来汇总这个数据帧。

我的方法是使用aggregate.data.frame函数:

# The `by` vector is 1, 1, 1, 2, 2, 2, 3, 3, 3 and is matched against the
# ROWS of df, so this averages every column within three blocks of rows.
aggregate.data.frame(
  df,
  by = list(rep(1:3, each = 3)),
  FUN = "mean"
)

我以为这样可行,但结果并不完全正确,输出如下:

Group.1         A_1         A_2        A_3        B_1        B_2        B_3         C_1        C_2        C_3
1       A -0.98266872 -0.44611789  1.1311552 -0.6462253  0.9860119 -0.1358210 -0.28851800 -0.3473174 -0.4657598
2       B  0.09587872 -0.30824098 -0.7870853  0.2057489 -0.2139321  0.4703226 -1.18052855 -0.1610957  0.9841623
3       C  0.32769438 -0.09263888 -0.9157273 -0.4739578  0.4841431 -0.6832925 -0.02889792  0.3202038 -0.5497325

输出应该是一个9行3列的数据框。有人能指出我哪里做错了吗?

为清楚起见,我希望输出看起来像这样:

            A          B          C
1 -1.63702823 -2.4111690  0.6868792
2 -0.82223674 -0.8007928  1.2550274
3 -1.11183424 -0.1849322  1.1610684
4  0.06883173  0.8440692  0.7233640
5  0.12595038  0.5271812  1.0493505
6 -1.45665365  1.2603648  0.9706727
7 -0.84226508  1.0921577  0.2829391
8  0.77141867 -0.8262696 -0.2910201
9  0.27212931 -0.2919420 -0.4680817

2 个答案:

答案 0 :(得分:4)

转换为data.table并创建行名列后,我们可以使用data.table中的melt。然后按'rn'分组,对.SDcols中指定的列求mean:

library(data.table)

# Reshape so that each letter group (A, B, C) becomes one value column, then
# average within each original row, identified by the "rn" row-name column.
# as.data.table() works on a copy, so the caller's df is left untouched;
# setDT() would convert df in place and add the "rn" column to it.
melt(
  as.data.table(df, keep.rownames = TRUE),
  measure.vars = patterns("A_\\d+", "B_\\d+", "C_\\d+"),
  value.name = c("A", "B", "C")
)[, lapply(.SD, mean), by = rn, .SDcols = c("A", "B", "C")
][, rn := NULL][]

或使用tidyverse

library(purrr)
library(dplyr)
library(magrittr)

# For each replicate suffix (1, 2, 3) pick the columns ending in "_<suffix>",
# add the three resulting data frames element-wise, and divide by the number
# of replicates to get the per-row mean of each letter group.
# The pattern is anchored so that "1" cannot also match names such as "A_10".
map(as.character(1:3), ~ df %>%
      select(matches(paste0("_", .x, "$")))) %>%
  reduce(`+`) %>%
  divide_by(3) %>%
  # Strip the "_<n>" suffix so the columns are named A, B, C.
  # rename_with() replaces the deprecated rename_all(funs(...)) form.
  rename_with(~ sub("_.*", "", .x))

答案 1 :(得分:1)

另一个tidyverse回答:

library(tidyverse)

# Long-then-wide reshape: one row per (original row, letter, replicate),
# average over the replicates, then spread the letters back into columns.
df %>%
  # Integer row id keeps the original row order when grouping; a character
  # id would sort as "1", "10", "2", ... once there are 10 or more rows.
  mutate(row = row_number()) %>%
  # Split names such as "A_1" into letter = "A" and replicate = "1".
  pivot_longer(
    -row,
    names_to = c("letter", "replicate"),
    names_sep = "_"
  ) %>%
  group_by(row, letter) %>%
  summarise(mean = mean(value), .groups = "drop") %>%
  pivot_wider(names_from = letter, values_from = mean) %>%
  select(-row) %>%
  # Return a plain data.frame, as the gather()/spread() version did.
  as.data.frame()

一次一步地运行以查看正在进行的操作。