在R中创建一个按多个变量汇总的单一表格

时间:2014-07-13 06:33:24

标签: r aggregate reshape

我有以下数据框:

# Example data: 10 patients x 6 tests x 3 months = 180 rows.
# Patients 1-5 fall in gp1 and patients 6-10 in gp2.
# No seed is set, so `value` differs from run to run.
pt_no <- rep(1:10, each = 18)
group <- rep(c("gp1", "gp2"), each = 90)
# `length.out` is spelled out in full instead of relying on partial
# matching of `length`.
test <- rep(1:6, each = 3, length.out = 180)
month <- rep(c(0, 1, 3), length.out = 180)
value <- runif(180, min = 100, max = 200)

oridf <- data.frame(pt_no, group, test, month, value)

head(oridf)
  pt_no group test month    value
1     1   gp1    1     0 114.7907
2     1   gp1    1     1 119.3668
3     1   gp1    1     3 135.8100
4     1   gp1    2     0 124.4290
5     1   gp1    2     1 156.0008
6     1   gp1    2     3 115.7246
> 

我需要基于'test'、'group'和'month'计算平均值,并制作如下表格:

test_no gp1_0month  gp2_0month  gp1_1month  gp2_1month  gp1_3month  gp2_3month
Test_1  136 137 152 143 156 150
Test_2  130 129 81  78  86  80
Test_3  129 128 68  68  74  71
Test_4  40  40  45  43  47  46
Test_5  203 201 141 134 149 142
Test_6  170 166 134 116 139 125

(上表中的平均值仅用于描述)

我可以使用tapply,但它给了我2个表:

# Mean of value for every test x month x group cell. All columns are taken
# from oridf itself, so the call does not depend on same-named global vectors.
with(oridf, tapply(value, list(test, month, group), mean))
, , gp1

         0        1        3
1 147.5239 145.7311 151.6526
2 157.8421 131.0775 144.3387
3 144.2670 146.8478 170.7292
4 150.6332 172.0349 147.2165
5 131.4145 161.2294 143.2634
6 142.6708 150.4848 160.5059

, , gp2

         0        1        3
1 142.3145 157.7935 152.4228
2 131.5410 163.1386 145.8485
3 134.6620 136.7388 167.1557
4 122.4177 164.5213 124.0728
5 154.2681 165.0370 152.8372
6 154.4926 141.0391 147.2471

如何获得单个平均值表?谢谢你的帮助。

2 个答案:

答案 0 :(得分:3)

我建议直接使用“reshape2”包中的dcast,因为您已经在使用该软件包(根据您之前问题中所接受的答案判断)。您可以在dcast内部完成汇总,因此无需使用tapply。

# Cast to wide format and aggregate in one step: one row per test, one column
# per group/month combination, each cell holding the mean of value.
# The reshape2:: prefix lets the call run without library(reshape2) and keeps
# it from being masked by another attached package that exports dcast.
reshape2::dcast(
  oridf,
  test ~ group + month,
  value.var = "value",
  fun.aggregate = mean
)
#   test    gp1_0    gp1_1    gp1_3    gp2_0    gp2_1    gp2_3
# 1    1 137.1429 133.8151 160.4778 157.0084 141.9559 158.0573
# 2    2 158.8491 164.0129 149.3565 167.2719 137.5862 150.1176
# 3    3 173.7005 157.0834 141.3190 139.5480 139.2146 168.2849
# 4    4 145.5688 142.9972 131.5501 151.9991 160.3696 141.8310
# 5    5 162.7410 152.9081 150.7274 163.1464 159.3328 154.4541
# 6    6 150.8428 151.3530 157.7583 138.8394 140.2631 159.7671

另一个选项(我分享它主要是为了展示tidyr的工作方式)是结合使用tidyr和dplyr,如下所示:

library(dplyr)
# devtools::install_github("hadley/tidyr")
library(tidyr)
# Long-to-wide table of means: one row per test, one column per group_month.
# spread() is kept to match the tidyr version this snippet targets;
# pivot_wider() is its replacement in tidyr >= 1.0.
oridf %>%
  group_by(group, test, month) %>%    # Columns to group by
  summarise(value = mean(value)) %>%  # Calculate the mean of value
  ungroup() %>%                       # Drop the grouping summarise() leaves
  unite(GM, group, month) %>%         # Combine the group and month columns
  spread(GM, value)                   # widen the result
# Source: local data frame [6 x 7]
# 
#   test    gp1_0    gp1_1    gp1_3    gp2_0    gp2_1    gp2_3
# 1    1 137.1429 133.8151 160.4778 157.0084 141.9559 158.0573
# 2    2 158.8491 164.0129 149.3565 167.2719 137.5862 150.1176
# 3    3 173.7005 157.0834 141.3190 139.5480 139.2146 168.2849
# 4    4 145.5688 142.9972 131.5501 151.9991 160.3696 141.8310
# 5    5 162.7410 152.9081 150.7274 163.1464 159.3328 154.4541
# 6    6 150.8428 151.3530 157.7583 138.8394 140.2631 159.7671

当然,由于您在生成示例数据时未使用set.seed(),因此我的值与您的值不匹配。对于这个答案,我使用了set.seed(1)。 :-)

答案 1 :(得分:1)

使用dplyr

library(dplyr)
# Group the rows by every test / month / group combination.
oridf_grp <- oridf %>%
  group_by(test, month, group)
# Collapse each combination to the mean of its values (column `mn`).
means <- oridf_grp %>%
  summarise(mn = mean(value))
means

Source: local data frame [36 x 4]
Groups: test, month

   test month group       mn
1     1     0   gp1 140.2762
2     1     0   gp2 145.8591
3     1     1   gp1 136.6484
4     1     1   gp2 144.1533
5     1     3   gp1 133.9756
6     1     3   gp2 143.8203
7     2     0   gp1 176.7885
8     2     0   gp2 133.6210
9     2     1   gp1 131.5861
10    2     1   gp2 144.7439
<snip>

或者你可以对tapply的输出使用melt:

library(reshape2)
# 3-d array of means indexed by test x month x group. The grouping columns are
# read from oridf so the call does not depend on same-named global vectors.
res_tapply <- with(oridf, tapply(value, list(test, month, group), mean))
# Flatten the array to long format: one row per cell (Var1, Var2, Var3, value).
melt(res_tapply)
   Var1 Var2 Var3    value
1     1    0  gp1 140.2762
2     2    0  gp1 176.7885
3     3    0  gp1 140.5861
4     4    0  gp1 156.3823
5     5    0  gp1 160.6399
6     6    0  gp1 143.4665
7     1    1  gp1 136.6484
8     2    1  gp1 131.5861
9     3    1  gp1 144.1809
10    4    1  gp1 122.7579
<snip>