有没有更快的方法来按条件筛选行?是否应该把 data.frame 转换成另一种类型?在这个测试数据集中我有大约 70 万行,但实际数据可能有数百万行。
基准测试的结果让我感到疑惑,因为所有数据都已经在内存中。另一种方案是改用数据库,但这需要一些额外的工作(DDL、索引)。
> str(df.test)
'data.frame': 694118 obs. of 4 variables:
$ uid : chr "ZyVOZrPOXwkuGSPv" "qBwuxhbrszRcISSRmIlYaQXHRUZE" "azCESULsUinrAeFkGIjEZpOLhrJcnB" "yLXPfpGlnLrtKmCRERj" ...
$ g1 : chr "group_70" "group_85" "group_150" "group_32" ...
$ g2 : chr "D" "A" "A" "C" ...
$ value: num 0.7756 0.1389 0.8924 0.2278 0.0709 ...
> df.test[200,]
uid g1 g2 value
200 appoBThmLxqFTyjFWyAqzsyJh group_2 E 0.604
>
> benchmark(replications = 100,df.test[(df.test$uid=='appoBThmLxqFTyjFWyAqzsyJh') &
+ (df.test$g1 == 'group_2') &
+ (df.test$g2 == 'E'),'value'])
test replications elapsed relative user.self sys.self user.child sys.child
1 df.test[(df.test$uid == "appoBThmLxqFTyjFWyAqzsyJh") & (df.test$g1 == "group_2") & (df.test$g2 == "E"), "value"] 100 10.72 1 10.713 0.007 0 0
>
> benchmark(replications = 100,subset(df.test,uid=='appoBThmLxqFTyjFWyAqzsyJh' & g1 == 'group_2' & g2== 'E' ))
test replications elapsed relative user.self sys.self user.child sys.child
1 subset(df.test, uid == "appoBThmLxqFTyjFWyAqzsyJh" & g1 == "group_2" & g2 == "E") 100 18.987 1 18.993 0 0 0
>
> library(data.table)
> dt.test <- data.table(df.test)
> benchmark(replications = 100,dt.test[(uid=='appoBThmLxqFTyjFWyAqzsyJh') &
+ (g1 == 'group_2') &
+ (g2 == 'E'),value])
test replications elapsed relative user.self sys.self user.child sys.child
1 dt.test[(uid == "appoBThmLxqFTyjFWyAqzsyJh") & (g1 == "group_2") & (g2 == "E"), value] 100 10.376 1 10.374 0.002 0 0
> setkey(dt.test,uid,g1,g2)
> #rm(dt.test)
> benchmark(replications = 100,dt.test[(uid=='appoBThmLxqFTyjFWyAqzsyJh') &
+ (g1 == 'group_2') &
+ (g2 == 'E'),value])
test replications elapsed relative user.self sys.self user.child sys.child
1 dt.test[(uid == "appoBThmLxqFTyjFWyAqzsyJh") & (g1 == "group_2") & (g2 == "E"), value] 100 13.244 1 13.261 0 0 0
答案 0 :(得分:5)
你并没有真正有效地使用 "data.table"。例如,在设置 key 之后,你应该考虑在 "data.table" 中使用 J()。
在这里,我重新生成了一些示例数据(提问时附上示例数据会让其他人更容易回答这类问题),并编写了几个用于基准测试的函数。
这是示例数据。如果你想尝试不同大小的数据集,请修改 "n":
library(stringi) ## for generating random strings

## Build a reproducible sample data set with the same layout as the data in
## the question: three character key columns (uid, g1, g2) and one numeric
## value column.
set.seed(1)                        ## fixed seed so the generated data is reproducible
uid <- stri_rand_strings(10000, 5) ## pool of 10000 random 5-character ids
g1 <- paste0("g", 1:1000)          ## "g1" ... "g1000"
g2 <- c(letters, LETTERS)          ## lower- and upper-case letters
n <- 1000000                       ## number of rows; change to try other sizes
df.test <- data.frame(
  uid = sample(uid, n, replace = TRUE),
  g1 = sample(g1, n, replace = TRUE),
  g2 = sample(g2, n, replace = TRUE),
  value = rnorm(n),
  ## Keep the key columns as character (as in the question's str() output)
  ## on every R version; before R 4.0 data.frame() converted strings to
  ## factors by default.
  stringsAsFactors = FALSE
)
df.test[200, ] ## The 200th row
以下是您的尝试:
## Base R logical-indexing approach from the question.
##   a1, a2, a3: values compared with the uid, g1 and g2 columns of the
##               global `df.test`.
## Returns the rows of `df.test` (all columns) where all three comparisons
## hold.
f1 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  uid_hit <- df.test$uid == a1
  g1_hit <- df.test$g1 == a2
  g2_hit <- df.test$g2 == a3
  df.test[uid_hit & g1_hit & g2_hit, ]
}
## subset() approach from the question.
##   a1, a2, a3: values compared with the uid, g1 and g2 columns of the
##               global `df.test`.
## Returns the rows of `df.test` for which the combined condition is TRUE.
f2 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  subset(
    x = df.test,
    subset = uid == a1 & g1 == a2 & g2 == a3
  )
}
library(data.table)
## Two data.table versions of the sample data: a plain one, and a separate
## copy that carries a key on (uid, g1, g2).
dt.test <- data.table(df.test)
## copy() gives setkeyv() its own table to work on, so dt.test stays unkeyed.
dt.test.keyed <- setkeyv(copy(dt.test), c("uid", "g1", "g2"))
## data.table version of the same filter, run on the unkeyed `dt.test`.
##   a1, a2, a3: values compared with the uid, g1 and g2 columns.
## Returns the rows of `dt.test` selected by the combined condition.
f3 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
dt.test[uid == a1 & g1 == a2 & g2 == a3]
}
## Same `==` filter expression as f3, but applied to `dt.test.keyed`
## (the copy whose key is uid, g1, g2).
##   a1, a2, a3: values compared with the uid, g1 and g2 columns.
## Returns the rows of `dt.test.keyed` selected by the combined condition.
f4 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
dt.test.keyed[uid == a1 & g1 == a2 & g2 == a3]
}
下面再补充一个 "data.table" 的方法和一个 "dplyr" 的方法:
## Lookup on the keyed table: the three target values are passed through
## J() as the `i` argument instead of being written as `==` comparisons.
##   a1, a2, a3: values for uid, g1 and g2, given in the same order as the
##               key set on `dt.test.keyed`.
## Returns the result of indexing `dt.test.keyed` with J(a1, a2, a3).
f5 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
dt.test.keyed[J(a1, a2, a3)]
}
library(dplyr)
## dplyr approach: filter() on the plain data.frame `df.test`.
##   a1, a2, a3: values compared with the uid, g1 and g2 columns.
## Returns the rows of `df.test` for which the combined condition holds.
f6 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  filter(
    .data = df.test,
    uid == a1 & g1 == a2 & g2 == a3
  )
}
而且,结果如下:
library(microbenchmark)
## Time all six approaches (100 evaluations each) and show the summary.
out <- microbenchmark(
  f1(), f2(), f3(), f4(), f5(), f6(),
  times = 100L
)
out
# Unit: milliseconds
# expr min lq mean median uq max neval
# f1() 315.560939 327.623885 340.639557 335.504160 342.442239 403.29851 100
# f2() 333.233436 350.439403 362.876115 356.168562 366.324454 440.86664 100
# f3() 227.923877 237.390578 249.932411 241.037701 246.196354 329.29018 100
# f4() 222.598481 232.748170 242.396059 237.787355 243.125148 302.71212 100
# f5() 1.606372 1.931555 2.602466 2.083269 2.367882 12.00145 100
# f6() 233.259460 243.932592 255.202134 249.279015 257.420772 329.48901 100
boxplot(out) ## Note: the time axis of this plot is on a log scale