加速data.table交叉

时间:2015-10-24 17:18:05

标签: r data.table

我正在寻找更快的方式来计算这类事情。 (真正的数据库有109 964 694行)。

DT<-data.table(id=c(1,2,1,4,2,1,7,8,8,10),
               effect=c("A","A","B","B","B","C","C","C","A","D"),
               value=1)

我想要effect =="A"effect == "C"

的ID
intersect(DT[effect=="A",(id)], DT[effect=="C",(id)])

还有其他更快的方法吗?

我还想将effect =="A"effect =="C"的ID设为..而不是effect=="B"

setdiff(
  intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]),
DT[effect=="B",(id)]
)

任何更好(更快)的想法?

此致

PS:我尝试过这种事情:

DT[,effect=="diag_998" | effect=="diag_1900",by=id][,sum(V1)==2,by=id][V1==TRUE,(id)]

但它非常非常慢......

编辑:感谢你的提议。

似乎没有什么比V1和A1更快:

system.time(V1<-intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]))# 0.014 sec
system.time(V2<-DT[effect=="A" | effect=="C", unique(id[duplicated(id)])]) #17sec
system.time(V3<-DT[,list(cond=all(c("A","C") %in% effect)),by=id][cond==TRUE,id] ) #more than 1 min
system.time(V4<-DT[effect=="A" | effect=="C", .N[.N > 1], by = id]$id) # 17 sec
system.time(V5<-DT[, .GRP[sum(c("A", "C") %chin% effect)==2], id]$id) # 103 sec

system.time(V6<-DT[, .GRP[sum(c("A", "C") %in% effect)==2], id]$id)#more than 1 min
setkey(DT, effect)
system.time(V7<-DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id)#0.19 sec



system.time(A1<-setdiff(intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]),DT[effect=="D",(id)])) # 0.014 sec
system.time(A2<-DT[,list(cond=all(c("A","C") %in% effect) & (!"D" %in% effect)),by=id][cond==TRUE,id])#more than 1 min
system.time(A3<-DT[,list(cond=all(c("A","C") %chin% effect) & (!"D" %chin% effect)),by=id][cond==TRUE,id])
system.time(A4<-DT[.(c("A", "C", "D")), if (.N == 2 & !("D" %in% effect)) TRUE, by = id]$id)# 1sec
编辑:另一个基准,感谢@MichaelChirico

microbenchmark(times=50L,
+                bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
+                bakal.keyed=intersect(DT["A", id], DT["C", id]),
+                rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
+                rscr1.keyed=DT[.(c("A","C")), unique(id[duplicated(id)])])
Unit: milliseconds
        expr       min        lq      mean    median        uq       max neval
       bakal 10.963171 11.003494 11.072844 11.019909 11.060331 12.641751    50
 bakal.keyed 10.738537 10.794715 10.878960 10.838630 10.892020 12.416713    50
       rscr1  9.504886  9.572026  9.662024  9.598426  9.645478 11.127047    50
 rscr1.keyed  9.013076  9.037370  9.065215  9.065669  9.083492  9.206366    50

1 个答案:

答案 0 :(得分:2)

看来你走的是正确的道路;我只能建议加快速度。

这是一个小型的样本数据集,用于清除最糟糕的尝试:

set.seed(10239)
NN<-1e6
DT<-data.table(id=sample(8e5,NN,T),
               effect=sample(LETTERS[1:4],NN,T),
               val=rnorm(NN),key="effect")

我最初的建议是使用键控,但不是最好的方式,基本上是:setkey(DT,effect); DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id

以下是此方法的基准,您的原始方法以及评论中提到的其他方法(我进行了一些优化,例如在@ nicola的方法中用cond==TRUE替换(cond) @ RichardScriven的effect=="A"|effect=="C" effect %in% c("A", C")

library(microbenchmark)
microbenchmark(times=50L,
               bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
               rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
               nicol=DT[,.(cond=all(c("A","C") %in% effect)), by=id][(cond), id],
               rscr2=DT[effect %in% c("A","C"), .N[.N > 1], by = id]$id,
               akrun=DT[, .GRP[sum(c("A", "C") %chin% effect) == 2], id]$id,
               mikec=DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id)

我的机器上有结果:

Unit: milliseconds
  expr        min         lq       mean     median         uq        max neval   cld
 bakal   14.82926   15.18540   17.86200   15.56453   16.70924   64.99443    50 a    
 rscr1   13.41102   13.98252   20.11127   14.93054   18.02248   66.14476    50 a    
 nicol 1329.82013 1377.03884 1436.45650 1404.48956 1483.47944 1758.00831    50     e
 rscr2  260.54888  269.86605  294.05219  276.66802  310.76356  479.50419    50   c  
 akrun  997.43300 1075.17322 1103.06220 1095.08246 1118.16848 1360.80793    50    d 
 mikec  154.39418  158.90884  180.01096  163.32763  206.59246  235.45523    50  b   

让我们看看这些在你提到的data.table上的表现如何:

set.seed(12039)
NN<-1e8
DT<-data.table(id=sample(8e5,NN,T),
               effect=sample(LETTERS[1:4],NN,T),
               val=rnorm(NN),key="effect")
microbenchmark(times=50L,
               bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
               bakal.keyed=intersect(DT["A", id], DT["C", id]),
               rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
               rscr1.keyed=DT[.(c("A","C")), unique(id[duplicated(id)])])
Unit: seconds
        expr      min       lq     mean   median       uq      max neval cld
       bakal 3.772309 3.991414 4.395669 4.408319 4.681609 5.170224    50  a 
 bakal.keyed 3.708500 3.807447 4.289518 4.384870 4.653427 5.085429    50  a 
       rscr1 4.962940 5.210845 5.721636 5.707369 6.162103 6.779900    50   b
 rscr1.keyed 4.904702 5.117411 5.727848 5.807186 6.194990 6.975508    50   b