我试图在汇总dplyr中的数据框时,为多个因子变量找到组中最常见的值。我需要一个执行以下操作的公式:
有几个公式可行。但是,我能想到的那些都很慢。快速的那些不方便一次应用于数据帧中的几个变量。我想知道是否有人知道一种与dplyr很好地集成的快速方法。
我尝试了以下内容:
生成样本数据(50000组,100个随机字母)
z <- data.frame(a = rep(1:50000,100), b = sample(LETTERS, 5000000, replace = TRUE))
str(z)
'data.frame': 5000000 obs. of 2 variables:
$ a: int 1 2 3 4 5 6 7 8 9 10 ...
$ b: Factor w/ 26 levels "A","B","C","D",..: 6 4 14 12 3 19 17 19 15 20 ...
&#34;清洁&#34;但慢速接近1
y <- z %>%
group_by(a) %>%
summarise(c = names(table(b))[which.max(table(b))])
user system elapsed
26.772 2.011 29.568
&#34;清洁&#34; -but-slow方法2
y <- z %>%
group_by(a) %>%
summarise(c = names(which(table(b) == max(table(b)))[1]))
user system elapsed
29.329 2.029 32.361
&#34;清洁&#34;但慢速接近3
y <- z %>%
group_by(a) %>%
summarise(c = names(sort(table(b),decreasing = TRUE)[1]))
user system elapsed
35.086 6.905 42.485
&#34;凌乱&#34;但快速接近
y <- z %>%
group_by(a,b) %>%
summarise(counter = n()) %>%
group_by(a) %>%
filter(counter == max(counter))
y <- y[!duplicated(y$a),]
y <- y$counter <- NULL
user system elapsed
7.061 0.330 7.664
答案 0 :(得分:8)
以下是dplyr
的另一个选项:
set.seed(123)
z <- data.frame(a = rep(1:50000,100),
b = sample(LETTERS, 5000000, replace = TRUE),
stringsAsFactors = FALSE)
a <- z %>% group_by(a, b) %>% summarise(c=n()) %>% filter(row_number(desc(c))==1) %>% .$b
b <- z %>% group_by(a) %>% summarise(c=names(which(table(b) == max(table(b)))[1])) %>% .$c
我们确保这些是等效的方法:
> identical(a, b)
#[1] TRUE
<强>更新强>
正如@docendodiscimus所提到的,你也可以这样做:
count(z, a, b) %>% slice(which.max(n))
以下是基准测试的结果:
library(microbenchmark)
mbm <- microbenchmark(
steven = z %>% group_by(a, b) %>% summarise(c = n()) %>% filter(row_number(desc(c))==1),
phil = z %>% group_by(a) %>% summarise(c = names(which(table(b) == max(table(b)))[1])),
docendo = count(z, a, b) %>% slice(which.max(n)),
times = 10
)
#Unit: seconds
# expr min lq mean median uq max neval cld
# steven 4.752168 4.789564 4.815986 4.813686 4.847964 4.875109 10 b
# phil 15.356051 15.378914 15.467534 15.458844 15.533385 15.606690 10 c
# docendo 4.586096 4.611401 4.669375 4.688420 4.702352 4.753583 10 a
答案 1 :(得分:6)
为什么选择dplyr?
#dummy data
set.seed(123)
z <- data.frame(a = rep(1:50000,100),
b = sample(LETTERS, 5000000, replace = TRUE))
#result
names(sort(table(z$b),decreasing = TRUE)[1])
# [1] "S"
#time it
system.time(
names(sort(table(z$b),decreasing = TRUE)[1])
)
# user system elapsed
# 0.36 0.00 0.36
编辑:多列
#dummy data
set.seed(123)
z <- data.frame(a = rep(1:50000,100),
b = sample(LETTERS, 5000000, replace = TRUE),
c = sample(LETTERS, 5000000, replace = TRUE),
d = sample(LETTERS, 5000000, replace = TRUE))
# check for multiple columns
sapply(c("b","c","d"), function(i)
names(sort(table(z[,i]),decreasing = TRUE)[1])
)
# b c d
#"S" "N" "G"
#time it
system.time(
sapply(c("b","c","d"), function(i)
names(sort(table(z[,i]),decreasing = TRUE)[1]))
)
# user system elapsed
# 0.61 0.17 0.78
答案 2 :(得分:6)
data.table
仍然是最快的选择:
z <- data.frame(a = rep(1:50000,100), b = sample(LETTERS, 5000000, replace = TRUE))
基准:
library(data.table)
library(dplyr)
#dplyr
system.time({
y <- z %>%
group_by(a) %>%
summarise(c = names(which(table(b) == max(table(b)))[1]))
})
user system elapsed
14.52 0.01 14.70
#data.table
system.time(
setDT(z)[, .N, by=b][order(N),][.N,]
)
user system elapsed
0.05 0.02 0.06
#@zx8754 's way - base R
system.time(
names(sort(table(z$b),decreasing = TRUE)[1])
)
user system elapsed
0.73 0.06 0.81
使用data.table可以看出:
setDT(z)[, .N, by=b][order(N),][.N,]
或
#just to get the name
setDT(z)[, .N, by=b][order(N),][.N, b]
似乎是最快的
所有列的更新:
使用@ zx8754的数据
set.seed(123)
z2 <- data.frame(a = rep(1:50000,100),
b = sample(LETTERS, 5000000, replace = TRUE),
c = sample(LETTERS, 5000000, replace = TRUE),
d = sample(LETTERS, 5000000, replace = TRUE))
你可以这样做:
#with data.table
system.time(
sapply(c('b','c','d'), function(x) {
data.table(x = z2[[x]])[, .N, by=x][order(N),][.N, x]
}))
user system elapsed
0.34 0.00 0.34
#with base-R
system.time(
sapply(c("b","c","d"), function(i)
names(sort(table(z2[,i]),decreasing = TRUE)[1]))
)
user system elapsed
4.14 0.11 4.26
只是为了确认结果是一样的:
sapply(c('b','c','d'), function(x) {
data.table(x = z2[[x]])[, .N, by=x][order(N),][.N, x]
})
b c d
S N G
sapply(c("b","c","d"), function(i)
names(sort(table(z2[,i]),decreasing = TRUE)[1]))
b c d
"S" "N" "G"
答案 3 :(得分:4)
按照LyzandeR的建议,我会再添加一个答案:
require(data.table)
setDT(z)[, .N, by=.(a,b)][order(-N), .(b=b[1L]), keyby=a]