查找向量与列表各元素之间的交集

时间:2015-04-13 01:40:59

标签: r list vector bigdata intersection

有没有更快的方法来查找向量与列表中每个元素之间的交集长度?

# Example data: a lookup vector and a list of character vectors.
v <- c("b", "l")
l <- list(
  c("a", "b", "c"),
  c("d", "b", "a"),
  c("m", "l", "h", "b")
)
# Baseline: size of the intersection of `v` with each element of `l`.
unlist(lapply(l, function(elem) length(intersect(v, elem))))

1 个答案:

答案 0 :(得分:1)

您可以尝试

 # Per element of `l`, count the entries that also occur in `v`.
 vapply(l, function(elem) sum(elem %in% v), FUN.VALUE = numeric(1))

或者

  library(stringi)
  # Turn the list into a character matrix; per the stringi docs the default
  # layout is one column per list element, padded with NA.
  m1 <- stri_list2matrix(l)
  # `%in%` returns a plain logical vector, so give it the matrix shape back
  # before summing each column.
  colSums(array(m1 %in% v, dim = dim(m1)))

基准测试

# rapply() variant: for each element of the global list `l`, the number of
# distinct values shared with the global vector `v`, flattened to a vector.
fr <- function() {
  rapply(l, function(elem) length(intersect(v, elem)), how = "unlist")
}
# lapply() + unlist() variant (the approach from the question): intersection
# size of the global `v` with each element of the global list `l`.
fl <- function() {
  counts <- lapply(l, function(elem) length(intersect(v, elem)))
  unlist(counts)
}
# vapply() variant: intersection size of the global `v` with each element of
# the global list `l`, returned as an integer vector.
fv <- function() {
  count_shared <- function(elem) length(intersect(v, elem))
  vapply(l, count_shared, FUN.VALUE = integer(1))
}
# Membership-count variant: for each element of the global list `l`, the
# number of its entries found in the global vector `v` (a double vector).
# Every matching entry is counted, so an element containing the same value
# twice contributes twice; this equals the intersection size only when the
# elements of `l` hold no repeated values.
fis <- function() {
  vapply(l, function(elem) sum(elem %in% v), FUN.VALUE = numeric(1))
}

library(stringi)
# Matrix variant: reshape the global list `l` into a character matrix with
# stringi, test every cell for membership in the global `v`, and sum per
# column. Per the stringi docs the default layout is one column per list
# element with NA padding; padded cells only match if `v` itself contains NA.
fN <- function() {
  padded <- stri_list2matrix(l)
  # `%in%` drops the dimensions, so restore the matrix shape before colSums().
  colSums(array(padded %in% v, dim = dim(padded)))
}

 # Reproducible benchmark data: a 100-element lookup vector `v` and a list `l`
 # of 1e5 character vectors.
 set.seed(29)
 # 20 distinct letters recycled against 1:100 give 100 unique strings.
 v <- paste0(sample(letters, 20, replace = FALSE), 1:100)
 # `size` is documented as a single number, so one value is drawn from 1:10
 # to decide how many letters are sampled. paste0() then recycles those
 # letters against 1:50, so every element of `l` has exactly 50 strings.
 l <- replicate(
   1e5,
   paste0(sample(letters[1:20], sample(10, 1), replace = TRUE), 1:50),
   simplify = FALSE
 )

 # Time a single run of each implementation on the benchmark data. The
 # commented lines under each call are the recorded system.time() output.
 system.time(fr())
 # user  system elapsed 
 # 2.914   0.006   2.927 
 system.time(fl())
 #  user  system elapsed 
 # 3.023   0.008   3.036 

 # From here on the results are stored so they can be compared afterwards.
 system.time(res1 <- fis())
 #  user  system elapsed 
 # 0.804   0.003   0.809 

 system.time(res2 <- fv())
 #  user  system elapsed 
 # 3.285   0.006   3.301 

 system.time(res3 <- fN())
 #  user  system elapsed 
 # 0.380   0.016   0.396 

# Check that fis() returns the same counts as the intersect()-based fv().
all.equal(res1, res2)
#[1] TRUE

# Check that the matrix-based fN() agrees with fis() as well.
all.equal(res1, res3)
#[1] TRUE

从 system.time 的结果中挑选几个较快的方法，再用 microbenchmark 进行比较

library(microbenchmark)
# Compare the three candidates over 20 runs each; timings are reported
# relative to the fastest expression (shown as 1 in the recorded output).
microbenchmark(
  fv(),
  fis(),
  fN(),
  times = 20L,
  unit = "relative"
)
#Unit: relative
#expr       min        lq      mean    median        uq       max neval cld
#  fv() 14.665591 13.534939 13.474595 12.224042 14.058256 15.172026    20  c
# fis()  3.442919  3.261627  4.161515  2.930333  3.440015  8.006546    20  b
#  fN()  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    20  a