我有以下数据框:
# Build the example data: 180 rows, i.e. 10 patients x 6 tests x 3 months.
# The first 90 rows belong to "gp1", the remaining 90 rows to "gp2".
pt_no <- rep(1:10, each = 18)
group <- rep(c("gp1", "gp2"), each = 90)
test <- rep(1:6, each = 3, length.out = 180)
month <- rep(c(0, 1, 3), length.out = 180)
# No seed is set, so the simulated values differ on every run.
value <- runif(n = 180, min = 100, max = 200)
oridf <- data.frame(
  pt_no = pt_no,
  group = group,
  test = test,
  month = month,
  value = value
)
head(oridf)
pt_no group test month value
1 1 gp1 1 0 114.7907
2 1 gp1 1 1 119.3668
3 1 gp1 1 3 135.8100
4 1 gp1 2 0 124.4290
5 1 gp1 2 1 156.0008
6 1 gp1 2 3 115.7246
>
我需要基于 'test'、'group' 和 'month' 计算平均值,并制作如下表格:
test_no gp1_0month gp2_0month gp1_1month gp2_1month gp1_3month gp2_3month
Test_1 136 137 152 143 156 150
Test_2 130 129 81 78 86 80
Test_3 129 128 68 68 74 71
Test_4 40 40 45 43 47 46
Test_5 203 201 141 134 149 142
Test_6 170 166 134 116 139 125
(上表中的平均值仅用于示意)
我可以使用tapply,但它给了我2个表:
# Mean of value for each test x month x group combination. With three
# grouping vectors tapply() returns a 3-d array, printed as one slice per group.
tapply(X = oridf$value, INDEX = list(test, month, group), FUN = mean)
, , gp1
0 1 3
1 147.5239 145.7311 151.6526
2 157.8421 131.0775 144.3387
3 144.2670 146.8478 170.7292
4 150.6332 172.0349 147.2165
5 131.4145 161.2294 143.2634
6 142.6708 150.4848 160.5059
, , gp2
0 1 3
1 142.3145 157.7935 152.4228
2 131.5410 163.1386 145.8485
3 134.6620 136.7388 167.1557
4 122.4177 164.5213 124.0728
5 154.2681 165.0370 152.8372
6 154.4926 141.0391 147.2471
如何获得单个平均值表?谢谢你的帮助。
答案 0 :(得分:3)
我建议直接使用 “reshape2” 包中的 dcast,因为您已经在使用该软件包(根据您在之前问题中采纳的答案判断)。您可以在 dcast 内部完成汇总,因此无需使用 tapply:
# Cast to wide format: one row per test, one column per group_month
# combination, with each cell holding the mean of value.
dcast(
  data = oridf,
  formula = test ~ group + month,
  fun.aggregate = mean,
  value.var = "value"
)
# test gp1_0 gp1_1 gp1_3 gp2_0 gp2_1 gp2_3
# 1 1 137.1429 133.8151 160.4778 157.0084 141.9559 158.0573
# 2 2 158.8491 164.0129 149.3565 167.2719 137.5862 150.1176
# 3 3 173.7005 157.0834 141.3190 139.5480 139.2146 168.2849
# 4 4 145.5688 142.9972 131.5501 151.9991 160.3696 141.8310
# 5 5 162.7410 152.9081 150.7274 163.1464 159.3328 154.4541
# 6 6 150.8428 151.3530 157.7583 138.8394 140.2631 159.7671
另一个选项(我分享它主要是为了展示 tidyr 的工作方式)是使用 tidyr + dplyr,如下所示:
library(dplyr)
# devtools::install_github("hadley/tidyr")
library(tidyr)
oridf %>%
  # One row per (group, test, month) cell, holding that cell's mean
  group_by(group, test, month) %>%
  summarise(value = mean(value)) %>%
  # Fuse group and month into a single key column GM, e.g. "gp1_0"
  unite(col = GM, group, month) %>%
  # Turn each GM key into its own column, leaving one row per test
  spread(key = GM, value = value)
# Source: local data frame [6 x 7]
#
# test gp1_0 gp1_1 gp1_3 gp2_0 gp2_1 gp2_3
# 1 1 137.1429 133.8151 160.4778 157.0084 141.9559 158.0573
# 2 2 158.8491 164.0129 149.3565 167.2719 137.5862 150.1176
# 3 3 173.7005 157.0834 141.3190 139.5480 139.2146 168.2849
# 4 4 145.5688 142.9972 131.5501 151.9991 160.3696 141.8310
# 5 5 162.7410 152.9081 150.7274 163.1464 159.3328 154.4541
# 6 6 150.8428 151.3530 157.7583 138.8394 140.2631 159.7671
当然,由于您在生成示例数据时未使用 set.seed(),因此我的值与您的值不一致。在这个答案中,我使用了 set.seed(1)。 :-)
答案 1 :(得分:1)
使用 dplyr:
library(dplyr)
# Group rows by every test / month / group combination ...
oridf_grp <- oridf %>% group_by(test, month, group)
# ... then reduce each of those groups to its mean value (long format).
means <- oridf_grp %>% summarise(mn = mean(value))
means
Source: local data frame [36 x 4]
Groups: test, month
test month group mn
1 1 0 gp1 140.2762
2 1 0 gp2 145.8591
3 1 1 gp1 136.6484
4 1 1 gp2 144.1533
5 1 3 gp1 133.9756
6 1 3 gp2 143.8203
7 2 0 gp1 176.7885
8 2 0 gp2 133.6210
9 2 1 gp1 131.5861
10 2 1 gp2 144.7439
<snip>
或者你可以对 tapply 的输出使用 melt:
library(reshape2)
# tapply() yields a test x month x group array of means ...
res_tapply <- tapply(
  X = oridf$value,
  INDEX = list(test, month, group),
  FUN = mean
)
# ... which melt() flattens into one row per cell (Var1, Var2, Var3, value).
melt(res_tapply)
Var1 Var2 Var3 value
1 1 0 gp1 140.2762
2 2 0 gp1 176.7885
3 3 0 gp1 140.5861
4 4 0 gp1 156.3823
5 5 0 gp1 160.6399
6 6 0 gp1 143.4665
7 1 1 gp1 136.6484
8 2 1 gp1 131.5861
9 3 1 gp1 144.1809
10 4 1 gp1 122.7579
<snip>