如何使用ddply以编程方式汇总多列?

时间:2015-08-21 18:11:53

标签: r parsing eval plyr

是否可以使用函数的参数指定要与ddply聚合的列,而不使用eval + parse?这是我到目前为止所拥有的:

# Two numeric measurement columns that will be averaged per group.
x <- c(2, 4, 3, 1, 5, 7)
y <- c(3, 2, 6, 3, 4, 6)

# Two grouping keys; each (group1, group2) combination forms one group.
group1 <- rep(c("A", "B"), times = c(4, 2))
group2 <- c("X", "X", "Y", "Y", "Z", "X")

# Column names are spelled out and match the names of the source vectors.
data <- data.frame(group1 = group1, group2 = group2, x = x, y = y)

下面是我想要输出的内容:

# Reference version with the averaged columns hard-coded, used to show the
# desired output. `toAverage` is accepted but not used here: the summary
# always computes mean(x) and mean(y) within each `toGroup` combination.
aggFunction <- function(dataframe, toAverage, toGroup) {
  ddply(
    dataframe,
    toGroup,
    summarise,
    x = mean(x),
    y = mean(y)
  )
}

aggFunction(data, c("x", "y"), c("group1", "group2"))

# group1 group2 x   y
# 1      A      X 3 2.5
# 2      A      Y 2 4.5
# 3      B      X 7 6.0
# 4      B      Z 5 4.0

这是我使用 eval + parse 的解决方案:

# eval/parse-based version: assembles the ddply() call as text and evaluates
# it, so every entry of `toAverage` becomes a `<col> = mean(<col>)` argument.
aggFunction <- function(dataframe, toAverage, toGroup) {
  # e.g. "x = mean(x), y = mean(y)"
  summaryArgs <- paste0(toAverage, " = mean(", toAverage, ")", collapse = ", ")
  callText <- paste("ddply(dataframe, toGroup, here(summarize),", summaryArgs, ")")

  # eval() runs the parsed call in this function's frame, which is where
  # `dataframe` and `toGroup` are resolved.
  eval(parse(text = callText))
}

这给了我想要的输出。

我想知道是否有更好的方法来做到这一点。我知道可以使用do.call()和get(),但用它们都没有成功(见下面的尝试)。

下面是一次尝试:get(string)不起作用,而使用here(summarise)可以取到字符串的值。不幸的是,ddply仍然把它们当作普通字符串处理:

# Attempt: pass the whole summary specification to ddply() as one character
# string. The string is handed over as a value, not as code, so it is not
# turned into `x = mean(x), y = mean(y)` arguments.
aggFunction <- function(dataframe, toAverage, toGroup) {

  # Builds a single string such as "x = mean(x), y = mean(y)".
  string <- paste(toAverage, " = mean(", toAverage, ")", sep = "", collapse = ", ")
  # `string` is one unnamed extra argument forwarded to summarise.
  out <- ddply(dataframe, toGroup, here(summarise), string)

  return(out)
}

aggFunction(data, c("x", "y"), c("group1", "group2"))

# group1 group2                      ..2
# 1      A      X x = mean(x), y = mean(y)
# 2      A      Y x = mean(x), y = mean(y)
# 3      B      X x = mean(x), y = mean(y)
# 4      B      Z x = mean(x), y = mean(y)

也试过do.call,但它们仍被视为字符串:

# Attempt: build the ddply() argument list by hand and invoke it through
# do.call(). The summary specification is still a single character string,
# so it reaches ddply() as a value rather than as named expressions.
aggFunction <- function(dataframe, toAverage, toGroup) {

  # Builds a single string such as "x = mean(x), y = mean(y)".
  string <- paste(toAverage, " = mean(", toAverage, ")", sep = "", collapse = ", ")
  # Debug output of the assembled string.
  print(string)

  # Positional arguments for ddply(): data, grouping variables, function, and
  # the string as one extra unnamed argument.
  args <- list(dataframe, toGroup, here(summarise), string)
  out <- do.call(ddply, args)

  return(out)
}
aggFunction(data, c("x", "y"), c("group1", "group2"))

# group1 group2 "x = mean(x), y = mean(y)"
# 1      A      X   x = mean(x), y = mean(y)
# 2      A      Y   x = mean(x), y = mean(y)
# 3      B      X   x = mean(x), y = mean(y)
# 4      B      Z   x = mean(x), y = mean(y)

最后我尝试把mean()硬编码进去,但这样就无法设置列名了。如果写成 get(testVar) = mean(get(testVar)),会得到“unexpected '='”(意外的“=”)语法错误。

# Attempt: hard-code one column name and look it up with get().
# NOTE(review): this snippet does not parse as written -- a call such as
# `get(testVar)` cannot appear on the left of `=` in an argument list (the
# "unexpected '='" error), and the ddply( call below is never closed.
aggFunction <- function(dataframe, toAverage, toGroup) {

  # Column to average, fixed for this experiment.
  testVar <- "x"

  out <- ddply(dataframe, toGroup, here(summarise), 
           get(testVar) = mean(get(testVar))
           ## 
  return(out)
}

3 个答案:

答案 0 :(得分:3)

在基础R中使用aggregate

# Average the `toAverage` columns within each combination of the `toGroup`
# columns using base R's aggregate().
#
# dataframe: a data.frame containing every referenced column.
# toAverage: character vector naming the columns to average.
# toGroup:   character vector naming the grouping columns.
# Returns a data.frame with one row per group: the grouping columns followed
# by the mean of each averaged column.
aggFunction <- function(dataframe, toAverage, toGroup) {
  # drop = FALSE keeps single-column selections as data frames. Without it a
  # single grouping column collapses to a vector (aggregate() then rejects
  # `by` because it is not a list), and a single averaged column loses its
  # name in the result.
  aggregate(
    dataframe[, toAverage, drop = FALSE],
    by = dataframe[, toGroup, drop = FALSE],
    FUN = mean
  )
}

aggFunction(data, c("x", "y"), c("group1", "group2"))

   group1 group2 x   y
1      A      X 3 2.5
2      B      X 7 6.0
3      A      Y 2 4.5
4      B      Z 5 4.0

答案 1 :(得分:1)

您可以考虑dplyr包 - 通常它比plyr快得多,而且语法也很漂亮。

library(dplyr)

# Sample data: two numeric columns and two grouping keys.
x <- c(2,4,3,1,5,7)
y <- c(3,2,6,3,4,6)
group1 <- c("A","A","A","A","B","B")
group2 <- c("X","X","Y","Y","Z","X")

# Average the `toAverage` columns within each combination of the `toGroup`
# columns.
#
# dataframe: a data frame containing every referenced column.
# toAverage: character vector naming the columns to average.
# toGroup:   character vector naming the grouping columns.
# Returns one row per group, with the grouping columns followed by the means.
aggFunction <- function(dataframe, toAverage, toGroup) {
  # across(all_of(...)) selects columns by name from a character vector; it
  # replaces the deprecated group_by_()/summarise_() string interface, so no
  # "mean(col)" expression text has to be assembled.
  dataframe %>%
    group_by(across(all_of(toGroup))) %>%
    # "drop_last" keeps the result grouped by all but the last grouping
    # column, which is what summarise() does by default, without the message.
    summarise(across(all_of(toAverage), mean), .groups = "drop_last")
}

data <- data.frame(group1, group2, x, y)
aggFunction(data, c("x", "y"), c("group1", "group2"))

它给出了:

  group1 group2 x   y
1      A      X 3 2.5
2      A      Y 2 4.5
3      B      X 7 6.0
4      B      Z 5 4.0

答案 2 :(得分:0)

如果先用melt把数据框转换成长格式,在长格式下进行计算,然后再用dcast转换回宽格式,这会容易得多。

library(reshape2)
library(plyr)
# Average the `toAverage` columns within each combination of the `toGroup`
# columns by reshaping to long format, summarising, and casting back to wide.
#
# d1:        a data frame containing every referenced column.
# toAverage: character vector naming the columns to average.
# toGroup:   character vector naming the grouping columns.
# Returns a data frame with the grouping columns followed by one column of
# means per entry of `toAverage`.
aggFunction <- function(d1, toAverage, toGroup) {
    # Long format: one row per (group..., variable, value).
    d2 <- melt(d1, id.vars = toGroup, measure.vars = toAverage)
    # Split by the caller's grouping columns plus the melted `variable` column
    # instead of a hard-coded `group1 + group2`.
    d3 <- ddply(d2, c(toGroup, "variable"), summarize, mean = mean(value))
    # Build the cast formula "<g1> + <g2> + ... ~ variable" from `toGroup`.
    # NOTE(review): assumes the grouping column names are syntactic R names.
    castFormula <- as.formula(paste(paste(toGroup, collapse = " + "), "~ variable"))
    dcast(d3, castFormula, value.var = "mean")
}
aggFunction(data, c("x", "y"), c("group1", "group2"))
##   group1 group2 x   y
## 1      A      X 3 2.5
## 2      A      Y 2 4.5
## 3      B      X 7 6.0
## 4      B      Z 5 4.0