按2个变量分组计算统计量(mean、max、sd)

时间:2016-05-16 07:33:03

标签: r aggregate

我需要知道如何按另外2个变量分组,计算某个变量的mean、max和sd。例如,对于下面的数据集,我想得到按区域(Rg)分组的Milk平均值、按渠道(CHn)分组的Milk平均值,以及按区域和渠道同时分组的Milk最大值等。

Rg  CHn Milk    Grc
1   1   7209    4897
1   1   2154    6824
2   1   2280    2112
2   2   11487   9490
3   1   685     2216
3   2   891     5226

2 个答案:

答案 0 :(得分:0)

强烈建议您使用dplyr软件包以获得更清晰的工作流程。以下是iris的示例:

library(dplyr)  # provides %>%, select() and the summarise helpers used below

# Summarise every remaining column of iris with mean, max and sd.
# NOTE: summarise_each() and funs() are deprecated in current dplyr releases.
# The modern spelling is
#   summarise(across(everything(), list(mean = mean, max = max, sd = sd)))
# which returns the same values but orders the result columns per variable
# instead of per function.
data(iris)
iris %>%
  select(-Species) %>%  # drop the non-numeric "Species" column first
  summarise_each(funs(mean, max, sd))
  Sepal.Length_mean Sepal.Width_mean Petal.Length_mean Petal.Width_mean Sepal.Length_max Sepal.Width_max Petal.Length_max Petal.Width_max Sepal.Length_sd
1          5.843333         3.057333             3.758         1.199333              7.9             4.4              6.9             2.5       0.8280661
  Sepal.Width_sd Petal.Length_sd Petal.Width_sd
1      0.4358663        1.765298      0.7622377

按物种获得mean,max和sd:

# Mean, max and sd of every non-grouping iris column, one row per species.
iris %>%
  group_by(Species) %>%
  summarise_each(funs(mean, max, sd))

Source: local data frame [3 x 13]

 Species Sepal.Length_mean Sepal.Width_mean Petal.Length_mean Petal.Width_mean Sepal.Length_max Sepal.Width_max Petal.Length_max Petal.Width_max
  (fctr)             (dbl)            (dbl)             (dbl)            (dbl)            (dbl)           (dbl)            (dbl)           (dbl)
1     setosa             5.006            3.428             1.462            0.246              5.8             4.4              1.9             0.6
2 versicolor             5.936            2.770             4.260            1.326              7.0             3.4              5.1             1.8
3  virginica             6.588            2.974             5.552            2.026              7.9             3.8              6.9             2.5
Variables not shown: Sepal.Length_sd (dbl), Sepal.Width_sd (dbl), Petal.Length_sd (dbl), Petal.Width_sd (dbl)

另一个例子:按2个变量分组计算mean、max和sd:

# Mean, max and sd of every non-grouping mtcars column,
# one row per (gear, carb) combination.
data(mtcars)
mtcars %>%
  group_by(gear, carb) %>%  # two grouping variables
  summarise_each(funs(mean, max, sd))

Source: local data frame [11 x 29]
Groups: gear [?]

   gear  carb mpg_mean cyl_mean disp_mean hp_mean drat_mean  wt_mean qsec_mean vs_mean am_mean mpg_max cyl_max disp_max hp_max drat_max wt_max qsec_max vs_max
   (dbl) (dbl)    (dbl)    (dbl)     (dbl)   (dbl)     (dbl)    (dbl)     (dbl)   (dbl)   (dbl)   (dbl)   (dbl)    (dbl)  (dbl)    (dbl)  (dbl)    (dbl)  (dbl)
1      3     1 20.33333 5.333333  201.0333   104.0    3.1800 3.046667  19.89000     1.0     0.0    21.5       6    258.0    110     3.70  3.460    20.22      1
2      3     2 17.15000 8.000000  345.5000   162.5    3.0350 3.560000  17.06000     0.0     0.0    19.2       8    400.0    175     3.15  3.845    17.30      0
3      3     3 16.30000 8.000000  275.8000   180.0    3.0700 3.860000  17.66667     0.0     0.0    17.3       8    275.8    180     3.07  4.070    18.00      0
4      3     4 12.62000 8.000000  416.4000   228.0    3.2200 4.685800  16.89400     0.0     0.0    14.7       8    472.0    245     3.73  5.424    17.98      0
5      4     1 29.10000 4.000000   84.2000    72.5    4.0575 2.072500  19.22000     1.0     1.0    33.9       4    108.0     93     4.22  2.320    19.90      1
6      4     2 24.75000 4.000000  121.0500    79.5    4.1625 2.683750  20.00500     1.0     0.5    30.4       4    146.7    109     4.93  3.190    22.90      1
7      4     4 19.75000 6.000000  163.8000   116.5    3.9100 3.093750  17.67000     0.5     0.5    21.0       6    167.6    123     3.92  3.440    18.90      1
8      5     2 28.20000 4.000000  107.7000   102.0    4.1000 1.826500  16.80000     0.5     1.0    30.4       4    120.3    113     4.43  2.140    16.90      1
9      5     4 15.80000 8.000000  351.0000   264.0    4.2200 3.170000  14.50000     0.0     1.0    15.8       8    351.0    264     4.22  3.170    14.50      0
10     5     6 19.70000 6.000000  145.0000   175.0    3.6200 2.770000  15.50000     0.0     1.0    19.7       6    145.0    175     3.62  2.770    15.50      0
11     5     8 15.00000 8.000000  301.0000   335.0    3.5400 3.570000  14.60000     0.0     1.0    15.0       8    301.0    335     3.54  3.570    14.60      0
Variables not shown: am_max (dbl), mpg_sd (dbl), cyl_sd (dbl), disp_sd (dbl), hp_sd (dbl), drat_sd (dbl), wt_sd (dbl), qsec_sd (dbl), vs_sd (dbl), am_sd (dbl)

请参阅 https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf ,了解可以节省大量数据处理时间的实用技巧。

答案 1 :(得分:0)

我会使用dplyr包。如果您的数据位于名为df的data.frame中,下面的代码会返回一个按区域(Rg)汇总的data.frame:

library(dplyr)

# Summarise Milk within each Rg group: mean, standard deviation and maximum.
# `df` is expected to already hold the question's data (columns Rg, CHn, Milk, Grc).
df %>%
  group_by(Rg) %>%
  summarize(
    mean = mean(Milk),
    sd = sd(Milk),
    max = max(Milk)
  )
# Source: local data frame [3 x 4]
#
#      Rg   mean       sd   max
#   (int)  (dbl)    (dbl) (int)
# 1     1 4681.5 3574.425  7209
# 2     2 6883.5 6510.332 11487
# 3     3  788.0  145.664   891

编辑:如果您需要同时按这两个变量(Rg和CHn)分组:

# Summarise Milk within each (Rg, CHn) combination: mean, standard deviation
# and maximum.
df %>%
  group_by(Rg, CHn) %>%
  summarize(
    mean = mean(Milk),
    sd = sd(Milk),
    max = max(Milk)
  )
# Source: local data frame [5 x 5]
# Groups: Rg [?]
# 
#      Rg   CHn    mean       sd   max
#   (int) (int)   (dbl)    (dbl) (int)
# 1     1     1  4681.5 3574.425  7209
# 2     2     1  2280.0      NaN  2280
# 3     2     2 11487.0      NaN 11487
# 4     3     1   685.0      NaN   685
# 5     3     2   891.0      NaN   891