复杂的重塑

时间:2013-03-03 05:50:33

标签: r dataframe reshape2

我想将数据帧从长格式转换为宽格式,并且我放弃了一些我想要保留的数据。 对于以下示例:

df <- data.frame(Par1 = unlist(strsplit("AABBCCC","")),
                 Par2 = unlist(strsplit("DDEEFFF","")),
                 ParD = unlist(strsplit("foo,bar,baz,qux,bla,xyz,meh",",")),
                 Type = unlist(strsplit("pre,post,pre,post,pre,post,post",",")),
                 Val = c(10,20,30,40,50,60,70))

   #     Par1 Par2 ParD Type Val
   #   1    A    D  foo  pre  10
   #   2    A    D  bar post  20
   #   3    B    E  baz  pre  30
   #   4    B    E  qux post  40
   #   5    C    F  bla  pre  50
   #   6    C    F  xyz post  60
   #   7    C    F  meh post  70

dfw <- dcast(df,
             formula = Par1 + Par2 ~ Type,
             value.var = "Val",
             fun.aggregate = mean)

 #     Par1 Par2 post pre
 #   1    A    D   20  10
 #   2    B    E   40  30
 #   3    C    F   65  50

这几乎是我的需要,但我想拥有

  1. 一些字段保存来自ParD字段的数据(例如,作为单个合并字符串),
  2. 用于聚合的观察次数。
  3. 即。我希望得到的data.frame如下:

        #     Par1 Par2 post pre Num.pre Num.post ParD
        #   1    A    D   20  10      1      1    foo_bar 
        #   2    B    E   40  30      1      1    baz_qux
        #   3    C    F   65  50      1      2    bla_xyz_meh
    

    我会感激任何想法。例如,我尝试通过写入dcast来解决第二个任务:fun.aggregate=function(x) c(Val=mean(x),Num=length(x)) - 但这会导致错误。

8 个答案:

答案 0 :(得分:14)

晚会,但这是使用data.table的另一种选择:

require(data.table)
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]), 
          post=mean(Val[Type == "post"]), 
          pre.num=length(Val[Type == "pre"]), 
          post.num=length(Val[Type == "post"]), 
          ParD = paste(ParD, collapse="_")), 
by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

[来自Matthew] +1一些小的改进,以保存重复相同的==,并在j内展示局部变量。

dt[, list(pre=mean(Val[.pre <- Type=="pre"]),     # save .pre
          post=mean(Val[.post <- Type=="post"]),  # save .post
          pre.num=sum(.pre),                      # reuse .pre
          post.num=sum(.post),                    # reuse .post
          ParD = paste(ParD, collapse="_")), 
by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

dt[, { .pre <- Type=="pre"                  # or save .pre and .post up front 
       .post <- Type=="post"
       list(pre=mean(Val[.pre]), 
            post=mean(Val[.post]),
            pre.num=sum(.pre),
            post.num=sum(.post), 
            ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

如果list列可以而不是paste,那么这应该更快:

dt[, { .pre <- Type=="pre"
       .post <- Type=="post"
       list(pre=mean(Val[.pre]), 
            post=mean(Val[.post]),
            pre.num=sum(.pre),
            post.num=sum(.post), 
            ParD = list(ParD)) }     # list() faster than paste()
, by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo,bar
# 2:    B    E  30   40       1        1     baz,qux
# 3:    C    F  50   65       1        2 bla,xyz,meh

答案 1 :(得分:13)

使用ddply分两步解决(我不满意,但我得到了结果)

dat <- ddply(df,.(Par1,Par2),function(x){
  data.frame(ParD=paste(paste(x$ParD),collapse='_'),
             Num.pre =length(x$Type[x$Type =='pre']),
             Num.post = length(x$Type[x$Type =='post']))
})

merge(dfw,dat)
 Par1 Par2 post pre        ParD Num.pre Num.post
1    A    D  2.0   1     foo_bar       1        1
2    B    E  4.0   3     baz_qux       1        1
3    C    F  6.5   5 bla_xyz_meh       1        2

答案 2 :(得分:6)

我会发帖但是agstudy让我感到羞耻:

step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
    piece1 <- tapply(x$Val, x$Type, mean)
    piece2 <- tapply(x$Type, x$Type, length)
    names(piece2) <- paste0("Num.", names(piece2))
    out <- x[1, 1:2]
    out[, 3:6] <- c(piece1, piece2)
    names(out)[3:6] <-  names(c(piece1, piece2))
    out$ParD <- paste(unique(x$ParD), collapse="_")
    out
})
data.frame(do.call(rbind, step3), row.names=NULL)

产量:

  Par1 Par2 post pre Num.post Num.pre        ParD
1    A    D  2.0   1        1       1     foo_bar
2    B    E  4.0   3        1       1     baz_qux
3    C    F  6.5   5        2       1 bla_xyz_meh

答案 3 :(得分:6)

你可以合并两个dcast和一个聚合,这里所有包装成一个大的表达式,主要是为了避免中间的物体随后徘徊:

Reduce(merge, list(
    dcast(df, formula = Par1+Par2~Type, value.var="Val",
        fun.aggregate=mean),
    setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
        fun.aggregate=length), c("Par1", "Par2", "Num.post",
        "Num.pre")),
    aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
    ))

答案 4 :(得分:6)

基准测试真是太棒了! 以下是plyr方法(由@agstudy建议)与data.table方法(由@Arun建议)的一些运行 使用不同的样本量(N = 900,2700,10800)

<强>要点:
data.table方法优于plyr方法7.5倍

#-------------------#
#   M E T H O D S   #
#-------------------#

  # additional methods below, in the updates

  # Method 1  -- suggested by @agstudy
  plyrMethod <- quote({
                  dfw<-dcast(df,
                         formula = Par1+Par2~Type,
                         value.var="Val",
                         fun.aggregate=mean)
                  dat <- ddply(df,.(Par1,Par2),function(x){
                    data.frame(ParD=paste(paste(x$ParD),collapse='_'),
                               Num.pre =length(x$Type[x$Type =='pre']),
                               Num.post = length(x$Type[x$Type =='post']))
                  })
                  merge(dfw,dat)
                })

  # Method 2 -- suggested by @Arun
  dtMethod <- quote(
                dt[, list(pre=mean(Val[Type == "pre"]), 
                          post=mean(Val[Type == "post"]), 
                          Num.pre=length(Val[Type == "pre"]), 
                          Num.post=length(Val[Type == "post"]), 
                          ParD = paste(ParD, collapse="_")), 
                by=list(Par1, Par2)]
              ) 

 # Method 3 -- suggested by @regetz
 reduceMethod <- quote(
                  Reduce(merge, list(
                      dcast(df, formula = Par1+Par2~Type, value.var="Val",
                          fun.aggregate=mean),
                      setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
                          fun.aggregate=length), c("Par1", "Par2", "Num.post",
                          "Num.pre")),
                      aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
                      ))
                  )

 # Method 4 -- suggested by @Ramnath
 castddplyMethod <- quote(
                      reshape::cast(Par1 + Par2 + ParD ~ Type, 
                           data = ddply(df, .(Par1, Par2), transform, 
                           ParD = paste(ParD, collapse = "_")), 
                           fun  = c(mean, length)
                          )
                      )



# SAMPLE DATA #
#-------------#

library(data.table)
library(plyr)
library(reshape2)
library(rbenchmark)


  # for Par1, ParD
  LLL <- apply(expand.grid(LETTERS, LETTERS, LETTERS, stringsAsFactors=FALSE), 1, paste0, collapse="")
  lll <- apply(expand.grid(letters, letters, letters, stringsAsFactors=FALSE), 1, paste0, collapse="")

  # max size is 17568 with current sample data setup, ie: floor(length(LLL) / 18) * 18
  size <- 17568
  size <- 10800
  size <- 900  

  set.seed(1)
  df<-data.frame(Par1=rep(LLL[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
                 , Par2=rep(lll[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
                 , ParD=sample(unlist(lapply(c("f", "b"), paste0, lll)), size, FALSE)
                 , Type=rep(c("pre","post"), size/2)
                 , Val =sample(seq(10,100,10), size, TRUE)
                 )

  dt <- data.table(df, key=c("Par1", "Par2"))


# Confirming Same Results # 
#-------------------------#
  # Evaluate
  DF1 <- eval(plyrMethod)
  DF2 <- eval(dtMethod)

  # Convert to DF and sort columns and sort ParD levels, for use in identical
  colOrder <- sort(names(DF1))
  DF1 <- DF1[, colOrder]
  DF2 <- as.data.frame(DF2)[, colOrder]
  DF2$ParD <- factor(DF2$ParD, levels=levels(DF1$ParD))
  identical((DF1), (DF2))
  # [1] TRUE
#-------------------------#

结果

#--------------------#
#     BENCHMARK      #
#--------------------#
benchmark(plyr=eval(plyrMethod), dt=eval(dtMethod), reduce=eval(reduceMethod), castddply=eval(castddplyMethod),
          replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), 
          order="relative")


# SAMPLE SIZE = 900
  relative      test elapsed user.self sys.self replications
     1.000    reduce   0.392     0.375    0.018            5
     1.003        dt   0.393     0.377    0.016            5
     7.064      plyr   2.769     2.721    0.047            5
     8.003 castddply   3.137     3.030    0.106            5

# SAMPLE SIZE = 2,700
  relative   test elapsed user.self sys.self replications
     1.000     dt   1.371     1.327    0.090            5
     2.205 reduce   3.023     2.927    0.102            5
     7.291   plyr   9.996     9.644    0.377            5

# SAMPLE SIZE = 10,800
  relative      test elapsed user.self sys.self replications
     1.000        dt   8.678     7.168    1.507            5
     2.769    reduce  24.029    23.231    0.786            5
     6.946      plyr  60.277    52.298    7.947            5
    13.796 castddply 119.719   113.333   10.816            5

# SAMPLE SIZE = 17,568
  relative   test elapsed user.self sys.self replications
     1.000     dt  27.421    13.042   14.470            5
     4.030 reduce 110.498    75.853   34.922            5
     5.414   plyr 148.452   105.776   43.156            5

更新:为baseMethod1

添加了结果
# Used only sample size of 90, as it was taking long
relative  test elapsed user.self sys.self replications
   1.000    dt   0.044     0.043    0.001            5
   7.773  plyr   0.342     0.339    0.003            5
  65.614 base1   2.887     2.866    0.028            5

Where
   baseMethod1 <- quote({
                  step1 <- with(df, split(df, list(Par1, Par2)))
                  step2 <- step1[sapply(step1, nrow) > 0]
                  step3 <- lapply(step2, function(x) {
                      piece1 <- tapply(x$Val, x$Type, mean)
                      piece2 <- tapply(x$Type, x$Type, length)
                      names(piece2) <- paste0("Num.", names(piece2))
                      out <- x[1, 1:2]
                      out[, 3:6] <- c(piece1, piece2)
                      names(out)[3:6] <-  names(c(piece1, piece2))
                      out$ParD <- paste(unique(x$ParD), collapse="_")
                      out
                  })
                  data.frame(do.call(rbind, step3), row.names=NULL)
                })

更新2:添加了将DT作为指标

的一部分进行键控

根据@MatthewDowle的评论,将索引步骤添加到公平基准中。
但是,假设使用data.table,它将取代data.frame和 因此,索引将发生一次,而不仅仅是针对此过程

   dtMethod.withkey <- quote({
                       dt <- data.table(df, key=c("Par1", "Par2"))       
                       dt[, list(pre=mean(Val[Type == "pre"]), 
                                 post=mean(Val[Type == "post"]), 
                                 Num.pre=length(Val[Type == "pre"]), 
                                 Num.post=length(Val[Type == "post"]), 
                                 ParD = paste(ParD, collapse="_")), 
                       by=list(Par1, Par2)]
                     }) 

# SAMPLE SIZE = 10,800
  relative       test elapsed user.self sys.self replications
     1.000         dt   9.155     7.055    2.137            5
     1.043 dt.withkey   9.553     7.245    2.353            5
     3.567     reduce  32.659    31.196    1.586            5
     6.703       plyr  61.364    54.080    7.600            5

更新3:将@ MD的编辑基准测试为@ Arun的原始答案

dtMethod.MD1 <- quote(
                  dt[, list(pre=mean(Val[.pre <- Type=="pre"]),     # save .pre
                            post=mean(Val[.post <- Type=="post"]),  # save .post
                            pre.num=sum(.pre),                      # reuse .pre
                            post.num=sum(.post),                    # reuse .post
                            ParD = paste(ParD, collapse="_")), 
                     by=list(Par1, Par2)]
                  )

dtMethod.MD2 <- quote(
                  dt[, { .pre <- Type=="pre"                  # or save .pre and .post up front 
                         .post <- Type=="post"
                         list(pre=mean(Val[.pre]), 
                              post=mean(Val[.post]),
                              pre.num=sum(.pre),
                              post.num=sum(.post), 
                              ParD = paste(ParD, collapse="_")) }
                  , by=list(Par1, Par2)]
                  )

dtMethod.MD3 <- quote(
                dt[, { .pre <- Type=="pre"
                       .post <- Type=="post"
                       list(pre=mean(Val[.pre]), 
                            post=mean(Val[.post]),
                            pre.num=sum(.pre),
                            post.num=sum(.post), 
                            ParD = list(ParD)) }     # list() faster than paste()
                , by=list(Par1, Par2)]
                )

benchmark(dt.M1=eval(dtMethod.MD1), dt.M2=eval(dtMethod.MD2), dt.M3=eval(dtMethod.MD3), dt=eval(dtMethod),
      replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), 
      order="relative")

#--------------------#

Comparing the different data.table methods amongst themselves


# SAMPLE SIZE = 900
  relative  test elapsed user.self sys.self replications
     1.000 dt.M3   0.198     0.197    0.001            5  <~~~ "list()" Method
     1.242 dt.M1   0.246     0.243    0.004            5
     1.253 dt.M2   0.248     0.242    0.007            5
     1.884    dt   0.373     0.367    0.007            5

# SAMPLE SIZE = 17,568
  relative  test elapsed user.self sys.self replications
     1.000 dt.M3  33.492    24.487    9.122            5   <~~~ "list()" Method
     1.086 dt.M1  36.388    11.442   25.086            5
     1.086 dt.M2  36.388    10.845   25.660            5
     1.126    dt  37.701    13.256   24.535            5

Comparing MD3 ("list" method) with MD1 (best of DT non-list methods)
Using a clean session  (ie, removing string cache)
_Note: Ran the following twice, fresh session each time, with practically identical results
       Then re-ran in the *same* session, with reps=5. Results very different._


benchmark(dt.M1=eval(dtMethod.MD1), dt.M3=eval(dtMethod.MD3), replications=1, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), order="relative")
# SAMPLE SIZE=17,568;  CLEAN SESSION
  relative  test elapsed user.self sys.self replications
     1.000 dt.M1   8.885     4.260    4.617            1
     1.633 dt.M3  14.506    12.821    1.677            1

# SAMPLE SIZE=17,568;  *SAME* SESSION
  relative  test elapsed user.self sys.self replications
     1.000 dt.M1  33.443    10.200   23.226            5
     1.048 dt.M3  35.060    26.127    8.915            5

#--------------------#

New benchmarks against previous methods
_Note: Not using the "list method" as results are not the same as other methods_

# SAMPLE SIZE = 900
  relative   test elapsed user.self sys.self replications
     1.000  dt.M1   0.254     0.247    0.008            5
     1.705 reduce   0.433     0.425    0.010            5
    11.280   plyr   2.865     2.842    0.031            5

# SAMPLE SIZE = 17,568
  relative   test elapsed user.self sys.self replications
     1.000  dt.M1  24.826    10.427   14.458            5
     4.348 reduce 107.935    70.107   38.314            5
     5.942   plyr 147.508   106.958   41.083            5

答案 5 :(得分:2)

reshape::castplyr::ddply

相结合的一步解决方案
cast(Par1 + Par2 + ParD ~ Type, data = ddply(df, .(Par1, Par2), transform, 
  ParD = paste(ParD, collapse = "_")), fun  = c(mean, length))

请注意,dcast中的reshape2函数不允许传递多个聚合函数,而cast中的reshape函数则会传递。{/ p>

答案 6 :(得分:2)

我相信这个基础R解决方案可与@ Arun的数据表解决方案相媲美。 (这并不是说我更喜欢它;那个代码更简单!)

baseMethod2 <- quote({
    is <- unname(split(1:nrow(df), with(df, paste(Par1, Par2, sep="\b"))))
    i1 <- sapply(is, `[`, 1)
    out <- with(df, data.frame(Par1=Par1[i1], Par2=Par2[i1]))
    js <- lapply(is, function(i) split(i, df$Type[i]))
    out$post <- sapply(js, function(j) mean(df$Val[j$post]))
    out$pre <- sapply(js, function(j) mean(df$Val[j$pre]))
    out$Num.pre <- sapply(js, function(j) length(j$pre))
    out$Num.post <- sapply(js, function(j) length(j$post))
    out$ParD <- sapply(is, function(x) paste(df$ParD[x], collapse="_"))
    out
})

分别使用@ RicardoSaporta的时序码900,2700和10,800:

> relative        test elapsed user.self sys.self replications
3    1.000 baseMethod2   0.230     0.229        0            5
1    1.130          dt   0.260     0.257        0            5
2    8.752        plyr   2.013     2.006        0            5

> relative        test elapsed user.self sys.self replications
3    1.000 baseMethod2   0.877     0.872        0            5
1    1.068          dt   0.937     0.934        0            5
2    8.060        plyr   7.069     7.043        0            5

> relative        test elapsed user.self sys.self replications
1    1.000          dt   6.232     6.178    0.031            5
3    1.085 baseMethod2   6.763     6.683    0.054            5
2    7.263        plyr  45.261    44.983    0.104            5

答案 7 :(得分:0)

尝试将不同的聚合表达式包装到一个自包含的函数中(表达式应该产生原子值)...

multi.by <- function(X, INDEX,...) {
    expressions <- substitute(...())
    duplicates <- duplicated(INDEX)
    res <- do.call(rbind,sapply(split(X,cumsum(!duplicates),drop=T), function(part) 
        sapply(expressions,eval,part,simplify=F),simplify=F))
    if (is.data.frame(INDEX)) res <- cbind(INDEX[!duplicates,],res)
    else rownames(res) <- INDEX[!duplicates]
    res
}

multi.by(df,df[,1:2],
    pre=mean(Val[Type=="pre"]), 
    post=mean(Val[Type=="post"]),
    Num.pre=sum(Type=="pre"),
    Num.post=sum(Type=="post"),
    ParD=paste(ParD, collapse="_"))