在dplyr

时间:2015-09-24 16:20:54

标签: r performance dplyr

我试图在汇总dplyr中的数据框时,为多个因子变量找到组中最常见的值。我需要一个执行以下操作的公式:

  1. 在一个组中的一个变量的所有因子中查找最常用的因子水平(所以基本上" max()"表示因子水平的计数。)
  2. 如果几个最常用因子级别之间存在联系,请选择其中任何一个因素级别。
  3. 返回因子级别名称(不是计数)。
  4. 有几个公式可行。但是,我能想到的那些都很慢。快速的那些不方便一次应用于数据帧中的几个变量。我想知道是否有人知道一种与dplyr很好地集成的快速方法。

    我尝试了以下内容:

    生成样本数据(50000组,100个随机字母)

    z <- data.frame(a = rep(1:50000,100), b = sample(LETTERS, 5000000, replace = TRUE))
    
    str(z)
    'data.frame':   5000000 obs. of  2 variables:
    $ a: int  1 2 3 4 5 6 7 8 9 10 ...
    $ b: Factor w/ 26 levels "A","B","C","D",..: 6 4 14 12 3 19 17 19 15 20 ...
    

    &#34;清洁&#34;但慢速接近1

     y <- z %>% 
        group_by(a) %>% 
        summarise(c = names(table(b))[which.max(table(b))])
    
    user    system  elapsed 
    26.772  2.011   29.568 
    

    &#34;清洁&#34; -but-slow方法2

    y <- z %>% 
        group_by(a) %>% 
        summarise(c = names(which(table(b) == max(table(b)))[1]))
    
    user    system  elapsed 
    29.329  2.029   32.361 
    

    &#34;清洁&#34;但慢速接近3

    y <- z %>% 
        group_by(a) %>% 
        summarise(c = names(sort(table(b),decreasing = TRUE)[1]))
    
    user    system  elapsed 
    35.086  6.905   42.485 
    

    &#34;凌乱&#34;但快速接近

    y <- z %>% 
         group_by(a,b) %>% 
         summarise(counter = n()) %>% 
         group_by(a) %>% 
         filter(counter == max(counter))
    y <- y[!duplicated(y$a),]
    y <- y$counter <- NULL
    
    user   system  elapsed 
    7.061  0.330   7.664 
    

4 个答案:

答案 0 :(得分:8)

以下是dplyr的另一个选项:

set.seed(123)
z <- data.frame(a = rep(1:50000,100), 
                b = sample(LETTERS, 5000000, replace = TRUE), 
                stringsAsFactors = FALSE)

a <- z %>% group_by(a, b) %>% summarise(c=n()) %>% filter(row_number(desc(c))==1) %>% .$b 
b <- z %>% group_by(a) %>% summarise(c=names(which(table(b) == max(table(b)))[1])) %>% .$c 

我们确保这些是等效的方法:

> identical(a, b)
#[1] TRUE

<强>更新

正如@docendodiscimus所提到的,你也可以这样做:

count(z, a, b) %>% slice(which.max(n))

以下是基准测试的结果:

library(microbenchmark)
mbm <- microbenchmark(
  steven = z %>% group_by(a, b) %>% summarise(c = n()) %>% filter(row_number(desc(c))==1),
  phil = z %>% group_by(a) %>% summarise(c = names(which(table(b) == max(table(b)))[1])),
  docendo = count(z, a, b) %>% slice(which.max(n)),
  times = 10
)

enter image description here

#Unit: seconds
#    expr       min        lq      mean    median        uq       max neval cld
#  steven  4.752168  4.789564  4.815986  4.813686  4.847964  4.875109    10  b 
#    phil 15.356051 15.378914 15.467534 15.458844 15.533385 15.606690    10   c
# docendo  4.586096  4.611401  4.669375  4.688420  4.702352  4.753583    10 a 

答案 1 :(得分:6)

为什么选择dplyr?

#dummy data
set.seed(123)
z <- data.frame(a = rep(1:50000,100),
                b = sample(LETTERS, 5000000, replace = TRUE))

#result
names(sort(table(z$b),decreasing = TRUE)[1])
# [1] "S"

#time it
system.time(
  names(sort(table(z$b),decreasing = TRUE)[1])
)

# user  system elapsed 
# 0.36    0.00    0.36 

编辑:多列

#dummy data
set.seed(123)
z <- data.frame(a = rep(1:50000,100),
                b = sample(LETTERS, 5000000, replace = TRUE),
                c = sample(LETTERS, 5000000, replace = TRUE),
                d = sample(LETTERS, 5000000, replace = TRUE))

# check for multiple columns
sapply(c("b","c","d"), function(i)
  names(sort(table(z[,i]),decreasing = TRUE)[1])
  )
# b   c   d 
#"S" "N" "G" 

#time it
system.time(
  sapply(c("b","c","d"), function(i)
    names(sort(table(z[,i]),decreasing = TRUE)[1]))
  )
# user  system elapsed 
# 0.61    0.17    0.78 

答案 2 :(得分:6)

data.table仍然是最快的选择:

z <- data.frame(a = rep(1:50000,100), b = sample(LETTERS, 5000000, replace = TRUE))

基准:

library(data.table)
library(dplyr)

#dplyr
system.time({
  y <- z %>% 
    group_by(a) %>% 
    summarise(c = names(which(table(b) == max(table(b)))[1]))  
})
 user  system elapsed 
14.52    0.01   14.70 

#data.table
system.time(
  setDT(z)[, .N, by=b][order(N),][.N,]
)
 user  system elapsed 
 0.05    0.02    0.06 

#@zx8754 's way - base R
system.time(
  names(sort(table(z$b),decreasing = TRUE)[1])
)
   user  system elapsed 
   0.73    0.06    0.81 

使用data.table可以看出:

  setDT(z)[, .N, by=b][order(N),][.N,]

  #just to get the name
  setDT(z)[, .N, by=b][order(N),][.N, b] 

似乎是最快的

所有列的更新:

使用@ zx8754的数据

set.seed(123)
z2 <- data.frame(a = rep(1:50000,100),
                b = sample(LETTERS, 5000000, replace = TRUE),
                c = sample(LETTERS, 5000000, replace = TRUE),
                d = sample(LETTERS, 5000000, replace = TRUE))

你可以这样做:

#with data.table
system.time(
 sapply(c('b','c','d'), function(x) {
  data.table(x = z2[[x]])[, .N, by=x][order(N),][.N, x] 
 }))
 user  system elapsed 
 0.34    0.00    0.34 

#with base-R
system.time(
  sapply(c("b","c","d"), function(i)
    names(sort(table(z2[,i]),decreasing = TRUE)[1]))
)
 user  system elapsed 
 4.14    0.11    4.26 

只是为了确认结果是一样的:

sapply(c('b','c','d'), function(x) {
  data.table(x = z2[[x]])[, .N, by=x][order(N),][.N, x] 
})
b c d 
S N G 

sapply(c("b","c","d"), function(i)
    names(sort(table(z2[,i]),decreasing = TRUE)[1]))
b   c   d 
"S" "N" "G" 

答案 3 :(得分:4)

按照LyzandeR的建议,我会再添加一个答案:

require(data.table)
setDT(z)[, .N, by=.(a,b)][order(-N), .(b=b[1L]), keyby=a]