R 中进行条件选择的最快方法

时间:2014-08-20 14:51:11

标签: r performance dataframe selection data.table

有没有更快的条件选择方法? 也许把 data.frame 转换为另一种类型会更好? 在这个测试数据中,我有大约 70 万行,但实际数据可能有数百万行。

我对基准测试的结果感到疑惑,因为所有数据都在内存中。 另一种方案可能是改用数据库,但那需要做一些额外的工作(DDL、索引)。

> str(df.test)
'data.frame':   694118 obs. of  4 variables:
 $ uid  : chr  "ZyVOZrPOXwkuGSPv" "qBwuxhbrszRcISSRmIlYaQXHRUZE" "azCESULsUinrAeFkGIjEZpOLhrJcnB" "yLXPfpGlnLrtKmCRERj" ...
 $ g1   : chr  "group_70" "group_85" "group_150" "group_32" ...
 $ g2   : chr  "D" "A" "A" "C" ...
 $ value: num  0.7756 0.1389 0.8924 0.2278 0.0709 ...
> df.test[200,]
              uid      g1 g2 value
200 appoBThmLxqFTyjFWyAqzsyJh group_2  E 0.604
> 
> benchmark(replications = 100,df.test[(df.test$uid=='appoBThmLxqFTyjFWyAqzsyJh') & 
+                                            (df.test$g1 == 'group_2') & 
+                                            (df.test$g2 == 'E'),'value'])
                                                          test replications elapsed relative user.self sys.self user.child sys.child
1 df.test[(df.test$uid == "appoBThmLxqFTyjFWyAqzsyJh") & (df.test$g1 == "group_2") & (df.test$g2 == "E"), "value"]          100   10.72        1    10.713    0.007          0         0
> 
> benchmark(replications = 100,subset(df.test,uid=='appoBThmLxqFTyjFWyAqzsyJh' & g1 == 'group_2' & g2== 'E' ))
                                           test replications elapsed relative user.self sys.self user.child sys.child
1 subset(df.test, uid == "appoBThmLxqFTyjFWyAqzsyJh" & g1 == "group_2" & g2 == "E")          100  18.987        1    18.993        0          0         0
> 
> library(data.table)          
> dt.test <- data.table(df.test)
> benchmark(replications = 100,dt.test[(uid=='appoBThmLxqFTyjFWyAqzsyJh') & 
+                                       (g1 == 'group_2') & 
+                                       (g2 == 'E'),value])
                                            test replications elapsed relative user.self sys.self user.child sys.child
1 dt.test[(uid == "appoBThmLxqFTyjFWyAqzsyJh") & (g1 == "group_2") & (g2 == "E"), value]          100  10.376        1    10.374    0.002          0         0
> setkey(dt.test,uid,g1,g2)
> #rm(dt.test)                     
> benchmark(replications = 100,dt.test[(uid=='appoBThmLxqFTyjFWyAqzsyJh') & 
+                                       (g1 == 'group_2') & 
+                                       (g2 == 'E'),value])
                                            test replications elapsed relative user.self sys.self user.child sys.child
1 dt.test[(uid == "appoBThmLxqFTyjFWyAqzsyJh") & (g1 == "group_2") & (g2 == "E"), value]          100  13.244        1    13.261        0          0         0

1 个答案:

答案 0 :(得分:5)

你的尝试并没有真正有效地使用 "data.table"。例如,在设置 key 之后,你应该考虑在 "data.table" 中使用 J()。

在这里,我重新创建了一些示例数据(提供示例数据能让其他人更容易回答这类问题),并且编写了几个用于基准测试的函数。

这是示例数据。如果您想尝试不同大小的数据集,请修改 "n":

library(stringi) ## provides stri_rand_strings() for generating random strings
## Fix the RNG seed so the sample data can be regenerated.
set.seed(1)
## Pools of candidate values for the three lookup columns.
## uid: 10000 random strings of 5 characters each.
uid <- stri_rand_strings(10000, 5)
## g1: "g1" ... "g1000".
g1 <- paste0("g", 1:1000)
## g2: the 52 single letters a-z followed by A-Z.
g2 <- c(letters, LETTERS)
## Number of rows in the test data; change this to try other data sizes.
n <- 1000000
## Each lookup column is drawn with replacement (third argument TRUE) from its
## pool; `value` is n draws from rnorm() with its default parameters.
df.test <- data.frame(
  uid = sample(uid, n, TRUE),
  g1 = sample(g1, n, TRUE),
  g2 = sample(g2, n, TRUE),
  value = rnorm(n)
)

## NOTE(review): the default arguments of f1()-f6() presumably come from this
## row; the generated values may differ across R versions -- confirm.
df.test[200, ] ## The 200th row

以下是您的尝试:

# Baseline: base-R logical indexing on the global data.frame `df.test`.
#
# a1, a2, a3: values compared (with ==) against the uid, g1 and g2 columns.
# Returns the rows of `df.test` selected by the combined mask, all columns kept.
f1 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  keep <- df.test$uid == a1 & df.test$g1 == a2 & df.test$g2 == a3
  df.test[keep, ]
}
# Baseline: the same filter expressed with base R's subset().
#
# a1, a2, a3: values compared (with ==) against the uid, g1 and g2 columns
# of the global data.frame `df.test`.
# Returns the rows of `df.test` for which all three comparisons are TRUE.
f2 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  subset(
    df.test,
    subset = uid == a1 &
      g1 == a2 &
      g2 == a3
  )
}

library(data.table)        
# Unkeyed data.table version of the sample data (used by f3).
dt.test <- data.table(df.test)
# Keyed version (used by f4 and f5). setkey() modifies its argument by
# reference and returns it, so the key is set on a copy and `dt.test`
# itself stays unkeyed.
dt.test.keyed <- setkey(copy(dt.test), uid, g1, g2)

# data.table filter on the unkeyed table `dt.test`, written as a plain
# logical expression in `i`.
#
# a1, a2, a3: values compared (with ==) against the uid, g1 and g2 columns.
# Returns the rows of `dt.test` for which all three comparisons are TRUE.
f3 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  dt.test[
    uid == a1 &
      g1 == a2 &
      g2 == a3
  ]
}

# Same logical-expression filter as f3, but run against the keyed table
# `dt.test.keyed`. The condition is still written with == and & rather than
# as a J() lookup; in the timings listed in this file it performs about the
# same as f3.
#
# a1, a2, a3: values compared (with ==) against the uid, g1 and g2 columns.
f4 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  dt.test.keyed[
    uid == a1 &
      g1 == a2 &
      g2 == a3
  ]
}

下面是另一种 "data.table" 的写法,以及一种 "dplyr" 的写法:

# Keyed lookup on `dt.test.keyed`: J(a1, a2, a3) is joined against the
# table's key columns (uid, g1, g2) instead of evaluating a logical
# expression over every row.
#
# a1, a2, a3: values to look up in the uid, g1 and g2 key columns.
# Returns the matching rows of `dt.test.keyed`; zero rows when nothing matches.
f5 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  # nomatch = 0L drops unmatched keys. The default (nomatch = NA) would
  # return a row with NA in `value`, unlike the zero-row result that
  # f1()-f4() and f6() give for the same inputs.
  dt.test.keyed[J(a1, a2, a3), nomatch = 0L]
}

library(dplyr)
# dplyr counterpart of the base-R filters, run on the data.frame `df.test`.
#
# a1, a2, a3: values compared (with ==) against the uid, g1 and g2 columns.
# Returns the rows of `df.test` for which all three comparisons are TRUE
# (comma-separated conditions in filter() are combined with &).
f6 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  dplyr::filter(df.test, uid == a1, g1 == a2, g2 == a3)
}

而且,结果如下:

library(microbenchmark)
# Time all six implementations with their default arguments;
# 100 evaluations per expression (microbenchmark's default, stated explicitly).
out <- microbenchmark(
  f1(), f2(), f3(), f4(), f5(), f6(),
  times = 100L
)
out
# Unit: milliseconds
#  expr        min         lq       mean     median         uq       max neval
#  f1() 315.560939 327.623885 340.639557 335.504160 342.442239 403.29851   100
#  f2() 333.233436 350.439403 362.876115 356.168562 366.324454 440.86664   100
#  f3() 227.923877 237.390578 249.932411 241.037701 246.196354 329.29018   100
#  f4() 222.598481 232.748170 242.396059 237.787355 243.125148 302.71212   100
#  f5()   1.606372   1.931555   2.602466   2.083269   2.367882  12.00145   100
#  f6() 233.259460 243.932592 255.202134 249.279015 257.420772 329.48901   100

# Per-function timing distributions; note that the time axis is a log scale.
boxplot(out)

enter image description here