Question

我有一个大型数据集，想对其进行重塑和聚合，但遇到了一些困难。

例如，请考虑以下数据，其中带引号的是数据集的列名。因子包括year、habitat、site和replicate；species和biomass两个字段是数据：

"year" "habitat" "site" "replicate" "species" "biomass"
 2010    inner       a          1       sp.1        10
 2010    inner       a          1       sp.3         6
 2010    inner       a          1       sp.1         5
 2010    inner       a          1       sp.2         8
 2010    inner       a          1       sp.4         4
 2010    inner       a          1       sp.5         7
 2010    inner       a          2       sp.3         5
 2010    inner       a          2       sp.2         6
 2010    inner       a          2       sp.5         2
 2010    inner       a          2       sp.1         5
 2010    inner       a          3       sp.4         5
 2010    inner       a          3       sp.3         4
 2010    inner       a          3       sp.6         8
 2010    inner       a          3       sp.2         5
 2010    outer       b          1       sp.1         6
 2010    outer       b          1       sp.3         9
 2010    outer       b          1       sp.3         3
 2010    outer       b          1       sp.2         6
 2010    outer       b          2       sp.5         4
 2010    outer       b          2       sp.1         5
 2010    outer       b          2       sp.1         7
 2010    outer       b          2       sp.2         5
 2010    outer       b          3       sp.4         2
 2010    outer       b          3       sp.6         5
 2010    outer       b          3       sp.2         4
 2010    outer       b          3       sp.1         4

我想要做的是重塑并聚合我的数据表，同时保留所有因子。也就是得到一个species × site的矩阵，保留各个因子，并在replicate水平上进行聚合。我在下面列出了我想要的结果示例，其中的数据是用Excel数据透视表的“sum”函数填充的。在R中是否有简单的代码可以实现，例如使用dplyr和group_by？或者tapply和aggregate？

year    habitat site    replicate   sp.1    sp.2    sp.3    sp.4    sp.5    sp.6
2010    inner    a          1        15      8       6       4       7       0
2010    inner    a          3        0       5       4       5       0       8
2010    outer    b          1        6       6       12      0       0       0
2010    outer    b          2        12      5       0       0       4       0
2010    outer    b          3        4       4       0       2       0       5

Answer 1

首先，如果你有一个大数据集，我建议使用data.table。

我经常使用下面这个“BreakBuild”函数，但我无法在你的数据上测试它。

library(data.table)
# The sample file starts with a row of quoted column names that has as many
# fields as the data rows, so read.table() does not treat it as a header on
# its own. header = TRUE is needed for year, habitat, ... to become column
# names (otherwise the columns are V1..V6 and the aggregation below fails).
mydata <- data.table(read.table("mydata", header = TRUE))

# Sum biomass for every year/habitat/site/replicate/species combination.
aggmydata <- mydata[, list(biomass = sum(biomass)),
                    by = list(year, habitat, site, replicate, species)]

#' Spread a long data.table into wide form, one column (set) per break value
#'
#' For every distinct value of `BCol`, the rows with that value are pulled out
#' (keeping only `IDCols` and `ValCols`), the value columns are renamed after
#' the break value, and the pieces are outer-merged on `IDCols`.
#'
#' @param df A data.table. It is keyed on `BCol` by reference with
#'   `setkeyv()`, so the caller's table is re-sorted as a side effect.
#'   Presumably it holds at most one row per ID combination and break value
#'   (e.g. already aggregated) -- verify against the caller.
#' @param BCol Name of the column whose distinct values become new columns.
#' @param IDCols Character vector of identifier columns to merge on.
#' @param ValCols Character vector of value columns to spread.
#' @return The merged wide table; ID combinations missing for a break value
#'   get `NA` in that value's column(s). `NULL` when `df` has no rows.
BreakBuild <- function(df, BCol, IDCols, ValCols) {
  setkeyv(df, BCol)
  NewCols <- unique(df[[BCol]]) # Distinct break values; each gets a table.
  ColList <- c(IDCols, ValCols) # Columns to keep; excludes the break column.
  ldt <- list() # Holds one data.table per break value, named by that value.
  for (bframe in NewCols) {
    # Use the character label for list names and column names, so a numeric
    # break value is never interpreted as a list position.
    label <- as.character(bframe)
    # list(bframe) forces a keyed lookup; a bare number in `i` would be read
    # as a row number instead of a key value.
    piece <- df[list(bframe), ColList, with = FALSE]
    if (length(ValCols) > 1) {
      # Several value columns: prefix each with the break value.
      setnames(piece, ValCols, paste(label, ValCols, sep = "."))
    } else {
      # A single value column simply takes the name of the break value.
      setnames(piece, ValCols, label)
    }
    ldt[[label]] <- piece
  }
  # Full outer merge of all pieces on the identifier columns.
  Reduce(function(...) merge(..., by = IDCols, all = TRUE), ldt)
}

# Spread the aggregated table: one biomass column per species, keyed by the
# year/habitat/site/replicate identifiers.
waggmydata <- BreakBuild(
  df = aggmydata,
  BCol = "species",
  IDCols = c("year", "habitat", "site", "replicate"),
  ValCols = "biomass"
)

你也可以使用reshape2和dcast，不过我并不熟悉dcast的语法。写法大致如下：

library(reshape2)
# Cast to wide form: one row per year/habitat/site/replicate, one column per
# species, cells holding the summed biomass. value.var and fun.aggregate are
# spelled out so the call does not depend on partial argument matching or on
# dcast guessing which column holds the values.
waggmydata <- dcast.data.table(
  mydata,
  year + habitat + site + replicate ~ species,
  value.var = "biomass",
  fun.aggregate = sum
)

如果您的数据集非常大（数百万行），那么使用data.table可能是值得的，不过看起来自我上次尝试以来，dplyr已经取得了重大进展。

以下是三种测试方法的基准测试结果。

require(data.table)
require(reshape2)
require(dplyr)
require(tidyr)
require(microbenchmark)
#' Spread a long data.table into wide form, one column (set) per break value
#'
#' For every distinct value of `BCol`, the rows with that value are pulled out
#' (keeping only `IDCols` and `ValCols`), the value columns are renamed after
#' the break value, and the pieces are outer-merged on `IDCols`.
#'
#' @param df A data.table. It is keyed on `BCol` by reference with
#'   `setkeyv()`, so the caller's table is re-sorted as a side effect.
#'   Presumably it holds at most one row per ID combination and break value
#'   (e.g. already aggregated) -- verify against the caller.
#' @param BCol Name of the column whose distinct values become new columns.
#' @param IDCols Character vector of identifier columns to merge on.
#' @param ValCols Character vector of value columns to spread.
#' @return The merged wide table; ID combinations missing for a break value
#'   get `NA` in that value's column(s). `NULL` when `df` has no rows.
BreakBuild <- function(df, BCol, IDCols, ValCols) {
  setkeyv(df, BCol)
  NewCols <- unique(df[[BCol]]) # Distinct break values; each gets a table.
  ColList <- c(IDCols, ValCols) # Columns to keep; excludes the break column.
  ldt <- list() # Holds one data.table per break value, named by that value.
  for (bframe in NewCols) {
    # Use the character label for list names and column names, so a numeric
    # break value is never interpreted as a list position.
    label <- as.character(bframe)
    # list(bframe) forces a keyed lookup; a bare number in `i` would be read
    # as a row number instead of a key value.
    piece <- df[list(bframe), ColList, with = FALSE]
    if (length(ValCols) > 1) {
      # Several value columns: prefix each with the break value.
      setnames(piece, ValCols, paste(label, ValCols, sep = "."))
    } else {
      # A single value column simply takes the name of the break value.
      setnames(piece, ValCols, label)
    }
    ldt[[label]] <- piece
  }
  # Full outer merge of all pieces on the identifier columns.
  Reduce(function(...) merge(..., by = IDCols, all = TRUE), ldt)
}

# Simulated benchmark input: one million rows drawn at random from small sets
# of years, habitats, sites, replicates and species, with integer biomass.
# TRUE is written out because T is an ordinary variable that can be reassigned.
mydata <- data.table(
  year = sample(c("2010", "2011", "2012"), 1e6, replace = TRUE),
  habitat = sample(c("inner", "outer"), 1e6, replace = TRUE),
  site = sample(letters[1:15], 1e6, replace = TRUE),
  replicate = sample(1:5, 1e6, replace = TRUE),
  species = sample(paste("sp", 1:20, sep = "."), 1e6, replace = TRUE),
  biomass = sample(1:30, 1e6, replace = TRUE)
)
# Plain data.frame copy of the same data for the dplyr/tidyr arm.
dat1 <- as.data.frame(mydata)


# Time three ways of turning the long table into a species-wide table.
microbenchmark(
  # dplyr + tidyr: aggregate per group, then spread species into columns,
  # filling missing combinations with 0.
  DPLYR = {
    dat1 %>%
      group_by(year, habitat, site, replicate, species) %>%
      summarise(biomass = sum(biomass)) %>%
      spread(species, biomass, fill = 0)
  },
  # data.table aggregation followed by the BreakBuild() split-and-merge.
  DATATABLE = {
    aggmydata <- mydata[, list(biomass = sum(biomass)),
                        by = list(year, habitat, site, replicate, species)]
    waggmydata <- BreakBuild(aggmydata, BCol = "species",
                             IDCols = c("year", "habitat", "site", "replicate"),
                             ValCols = "biomass")
  },
  # Single dcast call. value.var and fun.aggregate are spelled out so the
  # timed code does not rely on partial argument matching or on dcast
  # guessing the value column.
  # NOTE(review): the timings quoted after this call were recorded with the
  # original `fun=sum` spelling -- re-run to confirm they still hold.
  DCAST.DATA.TABLE = {
    waggmydata <- dcast.data.table(
      mydata,
      year + habitat + site + replicate ~ species,
      value.var = "biomass",
      fun.aggregate = sum
    )
  }
)

##             expr       min       lq   median       uq      max neval
##            DPLYR 168.26559 170.3902 171.9306 173.9712 189.5266   100
##        DATATABLE  97.21738 101.1543 103.5157 108.2527 125.9114   100
## DCAST.DATA.TABLE 184.58250 189.4021 192.0251 195.3731 242.9994   100

一旦掌握了语法，我认为DCAST.DATA.TABLE是这个例子中最容易编写的方法。

Answer 2

使用dplyr（假设dat1是数据集）

  library(dplyr)
  library(tidyr)

  # Total the biomass per year/habitat/site/replicate/species, then give each
  # species its own column; combinations with no record are filled with 0.
  dat1 %>%
    group_by(year, habitat, site, replicate, species) %>%
    summarise(biomass = sum(biomass)) %>%
    spread(key = species, value = biomass, fill = 0)
  # Source: local data frame [6 x 10]

  #  year habitat site replicate sp.1 sp.2 sp.3 sp.4 sp.5 sp.6
  #1 2010   inner    a         1   15    8    6    4    7    0
  #2 2010   inner    a         2    5    6    5    0    2    0
  #3 2010   inner    a         3    0    5    4    5    0    8
  #4 2010   outer    b         1    6    6   12    0    0    0
  #5 2010   outer    b         2   12    5    0    0    4    0
  #6 2010   outer    b         3    4    4    0    2    0    5

顺便说一下：habitat为inner的replicate 2没有出现在预期的输出中。

数据重塑和聚合大型数据库

2 个答案: