Question

如何在R中按某一列分组，然后计算其他每一列的平均值和标准差？

例如，考虑著名的Iris数据集。我想按物种（Species）分组，然后计算花瓣/萼片的长度和宽度测量值的平均值和标准差（sd）。我知道这与split-apply-combine有关，但我不确定该从哪里入手。

我能想出什么：

{
  "shortName": "Group",
  "namespace": "CM.Models",
  "baseTypeName": "Entity",
  "autoGeneratedKeyType": "Identity",
  "defaultResourceName": "Groups",
  "dataProperties": [
    {
      "name": "groupID",
      "dataType": "String",
      "maxLength": 32,
      "defaultValue": "",
      "validators": [
        {
          "name": "maxLength",
          "maxLength": 32
        }
      ]
    },
    {
      "name": "group",
      "dataType": "String",
      "maxLength": 32,
      "defaultValue": "",
      "validators": [
        {
          "name": "required"
        },
        {
          "name": "maxLength",
          "maxLength": 32
        }
      ]
    },
    {
      "name": "groupMembers",
      "dataType": "String",
      "isScalar": false,
      "defaultValue": []
    }
  ]
}

期望的输出：

# library() stops with an error if plyr is missing; require() would only
# return FALSE and let the ddply() call below fail with a less clear message.
library(plyr)

# Per-species mean and standard deviation of each of the four iris
# measurements: one row per species, one column per (measurement, statistic).
x <- ddply(iris, .(Species), summarise,
    Sepal.Length.Mean = mean(Sepal.Length),
    Sepal.Length.Sd = sd(Sepal.Length),
    Sepal.Width.Mean = mean(Sepal.Width),
    Sepal.Width.Sd = sd(Sepal.Width),
    Petal.Length.Mean = mean(Petal.Length),
    Petal.Length.Sd = sd(Petal.Length),
    Petal.Width.Mean = mean(Petal.Width),
    Petal.Width.Sd = sd(Petal.Width))

     Species Sepal.Length.Mean Sepal.Length.Sd Sepal.Width.Mean Sepal.Width.Sd
1     setosa             5.006       0.3524897            3.428      0.3790644
2 versicolor             5.936       0.5161711            2.770      0.3137983
3  virginica             6.588       0.6358796            2.974      0.3224966
  Petal.Length.Mean Petal.Length.Sd Petal.Width.Mean Petal.Width.Sd
1             1.462       0.1736640            0.246      0.1053856
2             4.260       0.4699110            1.326      0.1977527
3             5.552       0.5518947            2.026      0.2746501

Answer 1

我们可以尝试dplyr

library(dplyr)

# Per-species mean and sd of every measurement column. across() replaces the
# deprecated summarise_each()/funs(); with a named list of functions the
# result columns are named <column>_<function>, grouped column by column.
res <- iris %>%
  group_by(Species) %>%
  summarise(across(everything(), list(mean = mean, sd = sd)))

# Drop the Species column, transpose so each statistic becomes a row, and
# label the resulting columns with the species names.
`colnames<-`(t(res[-1]), as.character(res$Species))
#                     setosa versicolor virginica
#Sepal.Length_mean 5.0060000  5.9360000 6.5880000
#Sepal.Length_sd   0.3524897  0.5161711 0.6358796
#Sepal.Width_mean  3.4280000  2.7700000 2.9740000
#Sepal.Width_sd    0.3790644  0.3137983 0.3224966
#Petal.Length_mean 1.4620000  4.2600000 5.5520000
#Petal.Length_sd   0.1736640  0.4699110 0.5518947
#Petal.Width_mean  0.2460000  1.3260000 2.0260000
#Petal.Width_sd    0.1053856  0.1977527 0.2746501

或者，如@Steven Beaupre在评论中提到的，可以通过使用spread重塑来获得输出：

library(tidyr)

# Same per-species summary, reshaped so that each statistic is a row and each
# species is a column. across() replaces the deprecated
# summarise_each()/funs(). gather()/spread() still work; pivot_longer() and
# pivot_wider() are their current tidyr successors.
iris %>%
  group_by(Species) %>%
  summarise(across(everything(), list(mean = mean, sd = sd))) %>%
  gather(key, value, -Species) %>%
  spread(Species, value)

Answer 2

这是传统的plyr方法。它使用colwise计算所有列的摘要统计信息。

# Classic plyr approach: summarise every non-grouping column per species,
# once with mean() and once with sd(), then join both tables on Species.
# The merge suffixes tell the two sets of identically named columns apart.
summarise_by_species <- function(stat) {
  ddply(iris, .(Species), colwise(stat))
}
means <- summarise_by_species(mean)
sds <- summarise_by_species(sd)
merge(means, sds, by = "Species", suffixes = c(".mean", ".sd"))

Answer 3

如果您出于性能原因想使用data.table，可以试试下面的方法（别害怕——注释比代码还多 ;-)）。我尝试优化了所有性能关键点。

library(data.table)
# data.table version of the built-in iris data set, used by the grouped
# summary further down.
dt <- as.data.table(iris)

# Column-wise helper in the spirit of "colwise" from package "plyr".
#
# Arguments:
#   data   - the data.table whose columns are processed
#   func   - function applied to every column of "data"
#   suffix - string appended to each result column name
#
# Returns the per-column results as a data.table with suffixed column names
# (invisibly, because setnames() returns its result invisibly).
colwise.dt <- function(data, func, suffix) {
  # setDT() turns the list of per-column results into a data.table by
  # reference, i.e. without copying it.
  summarised <- setDT(lapply(data, func))
  # setnames() requires a data.table and also renames by reference.
  old_names <- names(summarised)
  setnames(summarised, old_names, paste0(old_names, suffix))
}

# Per-species mean and sd of every column, in wide form (one row per species).
# Note: .SD is a data.table containing the subset of dt's data for each group
# (Species), excluding any columns used in "by" (here: the Species column).
wide.result <- dt[, c(colwise.dt(.SD, mean, ".mean"),
                      colwise.dt(.SD, sd, ".sd")),
                  by = .(Species)]

# Reshape to long form: one row per (Species, statistic) pair.
long.result <- melt(wide.result, id.vars = "Species")

# Now transform into one column per group (species), one row per statistic.
# value.var is named explicitly so dcast() does not have to guess the value
# column (which it would report with a message).
final.result <- dcast(long.result, variable ~ Species, value.var = "value")

wide.result是：

      Species Sepal.Length.mean Sepal.Width.mean Petal.Length.mean Petal.Width.mean Sepal.Length.sd Sepal.Width.sd Petal.Length.sd Petal.Width.sd
1:     setosa             5.006            3.428             1.462            0.246       0.3524897      0.3790644       0.1736640      0.1053856
2: versicolor             5.936            2.770             4.260            1.326       0.5161711      0.3137983       0.4699110      0.1977527
3:  virginica             6.588            2.974             5.552            2.026       0.6358796      0.3224966       0.5518947      0.2746501

long.result是：

       Species          variable     value
 1:     setosa Sepal.Length.mean 5.0060000
 2: versicolor Sepal.Length.mean 5.9360000
 3:  virginica Sepal.Length.mean 6.5880000
 4:     setosa  Sepal.Width.mean 3.4280000
 5: versicolor  Sepal.Width.mean 2.7700000
 6:  virginica  Sepal.Width.mean 2.9740000
 7:     setosa Petal.Length.mean 1.4620000
 8: versicolor Petal.Length.mean 4.2600000
 9:  virginica Petal.Length.mean 5.5520000
10:     setosa  Petal.Width.mean 0.2460000
11: versicolor  Petal.Width.mean 1.3260000
12:  virginica  Petal.Width.mean 2.0260000
13:     setosa   Sepal.Length.sd 0.3524897
14: versicolor   Sepal.Length.sd 0.5161711
15:  virginica   Sepal.Length.sd 0.6358796
16:     setosa    Sepal.Width.sd 0.3790644
17: versicolor    Sepal.Width.sd 0.3137983
18:  virginica    Sepal.Width.sd 0.3224966
19:     setosa   Petal.Length.sd 0.1736640
20: versicolor   Petal.Length.sd 0.4699110
21:  virginica   Petal.Length.sd 0.5518947
22:     setosa    Petal.Width.sd 0.1053856
23: versicolor    Petal.Width.sd 0.1977527
24:  virginica    Petal.Width.sd 0.2746501

final.result是：

            variable    setosa versicolor virginica
1: Sepal.Length.mean 5.0060000  5.9360000 6.5880000
2:  Sepal.Width.mean 3.4280000  2.7700000 2.9740000
3: Petal.Length.mean 1.4620000  4.2600000 5.5520000
4:  Petal.Width.mean 0.2460000  1.3260000 2.0260000
5:   Sepal.Length.sd 0.3524897  0.5161711 0.6358796
6:    Sepal.Width.sd 0.3790644  0.3137983 0.3224966
7:   Petal.Length.sd 0.1736640  0.4699110 0.5518947
8:    Petal.Width.sd 0.1053856  0.1977527 0.2746501

与所需输出的唯一区别是，final.result把各个值的名称放在名为variable的第一列中，而不是存储在行名中。这可以通过把第一列设为行名、然后删除第一列来解决……

Answer 4

受到其他答案的启发，我找到了另一个同样有效的解决方案，它只使用dplyr和tidyr的函数。

# library() fails loudly when a package is missing; require() would only
# return FALSE and let a later call fail instead.
library(tidyr)
library(dplyr)

# Reshape to long form: one row per (Species, measurement name, value).
# pivot_longer() is the current replacement for the superseded gather().
x <- iris %>%
  pivot_longer(-Species, names_to = "var", values_to = "value")
# as_tibble() replaces the deprecated tbl_df().
print(as_tibble(x))

# Compute the mean and sd for each dimension
# (.groups = "drop" returns an ungrouped result, like a trailing ungroup()).
x <- x %>%
  group_by(Species, var) %>%
  summarise(mean = mean(value), sd = sd(value), .groups = "drop")
print(as_tibble(x))

# Convert the data frame from wide form to long form:
# the "mean" and "sd" columns become (stat, value) pairs.
x <- x %>%
  pivot_longer(mean:sd, names_to = "stat", values_to = "value")
print(as_tibble(x))

# Combine the variables "var" and "stat" into a single variable
x <- x %>%
  unite(var, var, stat, sep = ".")
print(as_tibble(x))

# Convert the data frame from long form to wide form: one column per species.
# pivot_wider() is the current replacement for the superseded spread().
x <- x %>%
  pivot_wider(names_from = Species, values_from = value)
print(as_tibble(x))

按列分组，然后计算R中每个其他列的平均值和sd

4 个答案: