优化版本的grep匹配矢量与矢量

时间:2014-01-25 13:01:51

标签: regex r optimization vector grep

假设我有两个字符向量ab

set.seed(123)
categ <- c("Control", "Gr", "Or", "PMT", "P450")
genes <- paste(categ, rep(1:40, each=length(categ)), sep="_")
a0 <- paste(genes, "_", rep(1:50, each=length(genes)), "_", sep="")
b0 <- paste (a0, "1", sep="")
ite <- 200
lg <- 2000
b <- b0[1:lg]
a <- (a0[1:lg])[sample(seq(lg), ite)]

我想应用grep函数,以查找ab的每个值的匹配项。 我当然可以这样做:

sapply(a, grep, b)

但是我想知道是否有更高效的东西,因为我必须在模拟中为更大的向量运行这么多次(注意我不想使用mclapply因为我已经使用它运行我的模拟的每次迭代):

system.time(lapply(seq(100000), function(x) sapply(a, grep, b)))
library(parallel)
system.time(mclapply(seq(100000), function(x) sapply(a, grep, b), mc.cores=8))  

4 个答案:

答案 0 :(得分:8)

由于您不使用正则表达式但希望在较长字符串中查找子字符串,因此可以使用fixed = TRUE。它要快得多。

library(microbenchmark)
microbenchmark(lapply(a, grep, b),                 # original
                 lapply(paste0("^", a), grep, b),  # @flodel
                 lapply(a, grep, b, fixed = TRUE))

Unit: microseconds
                             expr     min       lq   median       uq     max neval
               lapply(a, grep, b) 112.633 114.2340 114.9390 116.0990 326.857   100
  lapply(paste0("^", a), grep, b) 119.949 121.7380 122.7425 123.9775 191.851   100
 lapply(a, grep, b, fixed = TRUE)  21.004  22.5885  23.8580  24.6110  33.608   100

使用较长的矢量进行测试(原始长度的1000倍)。

ar <- rep(a, 1000)
br <- rep(b, 1000)

library(microbenchmark)
microbenchmark(lapply(ar, grep, br),               # original
               lapply(paste0("^", ar), grep, br),  # @flodel
               lapply(ar, grep, br, fixed = TRUE))

Unit: seconds
                               expr       min        lq    median       uq       max neval
               lapply(ar, grep, br) 32.288139 32.564223 32.726149 32.97529 37.818299   100
  lapply(paste0("^", ar), grep, br) 24.997339 25.343401 25.531138 25.71615 28.238802   100
 lapply(ar, grep, br, fixed = TRUE)  2.461934  2.494759  2.513931  2.55375  4.194093   100

(这花了很长时间......)

答案 1 :(得分:3)

继我最后的建议......

您所问的最大问题是,先验,您需要进行length(a) * length(b)比较。但是,您可以利用这里的匹配仅发生在字符串的开头(我从评论中收集的内容)这一事实。

我建议您在查看第一个单词(“或”,“Gr”,“控制”,“PMT”等)之后先将ab个向量拆分为列表。在每个项目中,然后只查找相应集合中的匹配项。换句话说,请使用以a开头的Or_中的项目,仅查找b中同样以Or_开头的项目中的匹配项。

让您了解为什么这在复杂性方面是有效的。想象一下,ab的长度均为n;有x个可能的前缀,在ab中均匀分布。然后,在您的情况下,您只需要与x * (n/x * n/x)进行n * n比较。这比较少x倍。你甚至可以想象以递归的方式使用第二个单词,第三个等重复这个过程。

现在这里是代码:

reduced.match <- function(a, b) {

   first.word <- function(string) sub("_.*", "", string)

   a.first <- first.word(a)
   b.first <- first.word(b)
   l.first <- unique(c(a.first, b.first))
   a.first <- factor(a.first, l.first)
   b.first <- factor(b.first, l.first)
   a.split <- split(a, a.first)
   b.split <- split(b, b.first)
   a.idx.split <- split(seq_along(a), a.first)
   b.idx.split <- split(seq_along(b), b.first)

   unsorted.matches <-
     Map(function(a, b, i) lapply(a, function(x) i[grep(x, b, fixed = TRUE)]),
         a.split, b.split, b.idx.split, USE.NAMES = FALSE)

   sorted.matches <-
     unlist(unsorted.matches, recursive = FALSE)[
       match(seq_along(a), unlist(a.idx.split))]

   return(sorted.matches)
}

# sample data
set.seed(123)
n <- 10000
words <- paste0(LETTERS, LETTERS, LETTERS)
a <- paste(sample(words[-1], n, TRUE),
           sample(words, n, TRUE), sep = "_")
b <- paste(sample(words[-2], n, TRUE),
           sample(words, n, TRUE), sep = "_")

# testing
identical(reduced.match(a, b), lapply(a, grep, b, fixed = TRUE))
# [1] TRUE

# benchmarks
system.time(reduced.match(a, b))
#    user  system elapsed 
#   0.187   0.000   0.187 
system.time(lapply(a, grep, b, fixed = TRUE))
#    user  system elapsed 
#   2.915   0.002   2.920 

答案 2 :(得分:3)

如果a和b被排序(并且是唯一的)并且一个人对字符串开头的完全匹配感兴趣,那么下面的C代码通常会相对有效(长度(a)+长度的顺序) (b)字符串比较?)。 R包装器确保C代码和R用户获得适当的数据。

f3 <- local({
    library(inline)
    .amatch <- cfunction(c(a="character", b="character"),
             includes="#include <string.h>", '
         int len_a = Rf_length(a), len_b = Rf_length(b);
         SEXP ans = PROTECT(allocVector(INTSXP, len_b));
         memset(INTEGER(ans), 0, sizeof(int) * len_b);
         int cmp, i = 0, j = 0;
         while (i < len_a) {
             const char *ap = CHAR(STRING_ELT(a, i));
             while (j < len_b) {
                 cmp = strncmp(ap, CHAR(STRING_ELT(b, j)), strlen(ap));
                 if (cmp > 0) {
                     j += 1;
                 } else break;
             }
             if (j == len_b)
                 break;
             if (cmp == 0)
                 INTEGER(ans)[j++] = i + 1;
             else if (cmp < 0) i += 1;
         }
         UNPROTECT(1);
         return(ans);')

    function(a, b) {
        locale = Sys.getlocale("LC_COLLATE")
        if (locale != "C") {
            warning('temporarily trying to set LC_COLLATE to "C"')
            Sys.setlocale("LC_COLLATE", "C")
            on.exit(Sys.setlocale("LC_COLLATE", locale))
        }
        a0 <- a
        lvls <- unique(a)
        a <- sort(lvls)
        o <- order(b)
        idx <- .amatch(a, b[o])[order(o)]
        f <- factor(a[idx[idx != 0]], levels=lvls)
        split(which(idx != 0), f)[a0]
    }
})

与这个半友好的grep相比

f0 <- function(a, b) {
    a0 <- a
    a <- unique(a)
    names(a) <- a
    lapply(a, grep, b, fixed=TRUE)[a0]
}

允许(但不会付出太多代价)复制'a'值@ flodel的数据集的时间是

> microbenchmark(f0(a, b), f3(a, b), times=5)
Unit: milliseconds
     expr       min        lq    median        uq       max neval
 f0(a, b) 431.03595 431.45211 432.59346 433.96036 434.87550     5
 f3(a, b)  15.70972  15.75976  15.93179  16.05184  16.06767     5

不幸的是,当一个元素是另一个元素的前缀

时,这个简单的算法会失败
> str(f0(c("a", "ab"), "abc"))
List of 2
 $ : chr "abc"
 $ : chr "abc"
> str(f3(c("a", "ab"), "abc"))
List of 2
 $ : chr "abc"
 $ : chr(0) 

与评论相反,对于此数据集(需要为可重复性指定随机数种子)

set.seed(123)
categ <- c("Control", "Gr", "Or", "PMT", "P450")
genes <- paste(categ, rep(1:40, each=length(categ)), sep="_")
a0 <- paste0(genes, "_", rep(1:50, each=length(genes)), "_")
b0 <- paste0(a0, "1")
ite <- 50
lg <- 1000
b <- b0[1:lg]
a <- (a0[1:lg])[sample(seq(lg), ite)]

f3()返回与grep

相同的值
> identical(unname(f3(a, b)), lapply(a, grep, b, fixed=TRUE))
[1] TRUE

已修改算法f0和f3以返回命名列表中的索引。

答案 3 :(得分:1)

我在自己的数据上测试了@flodel和@Sven Hohenstein提出的不同解决方案(注意@Martin Morgan的方法暂时无法测试,因为它不支持a的元素。 a)的其他元素的前缀。

重要说明:尽管所有方法都在我的具体情况下给出了相同的结果,但提醒他们都有自己的方式,因此可以根据数据的结构给出不同的结果

以下是快速摘要(结果如下所示):

  1. 在我的测试中,length(a)length(b)分别设置为200或400和2,000或10,000
  2. a
  3. b的每个值只有一个匹配项
  4. 最好的方法确实取决于问题而且所有特定情况都需要进行测试
  5. pmatch总是表现得非常好(特别是对于较小长度的向量ab,分别小于100和1,000 - 未在下面显示),
  6. sapply(a, grep, b, fixed=T)reduced.match(flodel方法)功能的效果始终优于sapply(a, grep, b))sapply(paste0("^", a), grep, b)
  7. 以下是可重现的代码以及测试结果

    # set up the data set
    library(microbenchmark)
    categ <- c("Control", "Gr", "Or", "PMT", "P450")
    genes <- paste(categ, rep(1:40, each=length(categ)), sep="_")
    a0 <- paste(genes, "_", rep(1:50, each=length(genes)), "_", sep="")
    b0 <- paste (a0, "1", sep="")
    
    # length(a)==200 & length(b)==2,000
    ite <- 200
    lg <- 2000
    b <- b0[1:lg]
    a <- (a0[1:lg])[sample(seq(lg), ite)]
    
    microbenchmark(as.vector(sapply(a, grep, b)),                 # original
                   as.vector(sapply(paste0("^", a), grep, b)),  # @flodel 1
                   as.vector(sapply(a, grep, b, fixed = TRUE)), # Sven Hohenstein
                   unlist(reduced.match(a, b)), # @ flodel 2
    #~               f3(a, b), @Martin Morgan
                   pmatch(a, b))
    
    Unit: milliseconds
                                            expr        min         lq     median
                   as.vector(sapply(a, grep, b)) 188.810585 189.256705 189.827765
      as.vector(sapply(paste0("^", a), grep, b)) 157.600510 158.113507 158.560619
     as.vector(sapply(a, grep, b, fixed = TRUE))  23.954520  24.109275  24.269991
                     unlist(reduced.match(a, b))   7.999203   8.087931   8.140260
                                    pmatch(a, b)   7.459394   7.489923   7.586329
             uq        max neval
     191.412879 222.131220   100
     160.129008 186.695822   100
      25.923741  26.380578   100
       8.237207  10.063783   100
       7.637560   7.888938   100
    
    
    # length(a)==400 & length(b)==2,000
    ite <- 400
    lg <- 2000
    b <- b0[1:lg]
    a <- (a0[1:lg])[sample(seq(lg), ite)]
    
    microbenchmark(as.vector(sapply(a, grep, b)),                 # original
                   as.vector(sapply(paste0("^", a), grep, b)),  # @flodel 1
                   as.vector(sapply(a, grep, b, fixed = TRUE)), # Sven Hohenstein
                   unlist(reduced.match(a, b)), # @ flodel 2
    #~               f3(a, b), @Martin Morgan
                   pmatch(a, b))
    
    Unit: milliseconds
                                            expr       min        lq    median
                   as.vector(sapply(a, grep, b)) 376.85638 379.58441 380.46107
      as.vector(sapply(paste0("^", a), grep, b)) 314.38333 316.79849 318.33426
     as.vector(sapply(a, grep, b, fixed = TRUE))  49.56848  51.54113  51.90420
                     unlist(reduced.match(a, b))  13.31185  13.44923  13.57679
                                    pmatch(a, b)  15.15788  15.24773  15.36917
            uq       max neval
     383.26959 415.23281   100
     320.92588 346.66234   100
      52.02379  81.65053   100
      15.56503  16.83750   100
      15.45680  17.58592   100
    
    
    # length(a)==200 & length(b)==10,000
    ite <- 200
    lg <- 10000
    b <- b0[1:lg]
    a <- (a0[1:lg])[sample(seq(lg), ite)]
    
    microbenchmark(as.vector(sapply(a, grep, b)),                 # original
                   as.vector(sapply(paste0("^", a), grep, b)),  # @flodel 1
                   as.vector(sapply(a, grep, b, fixed = TRUE)), # Sven Hohenstein
                   unlist(reduced.match(a, b)), # @ flodel 2
    #~               f3(a, b), @Martin Morgan
                   pmatch(a, b))
    
    Unit: milliseconds
                                            expr       min        lq    median
                   as.vector(sapply(a, grep, b)) 975.34831 978.55579 981.56864
      as.vector(sapply(paste0("^", a), grep, b)) 808.79299 811.64919 814.16552
     as.vector(sapply(a, grep, b, fixed = TRUE)) 119.64240 120.41718 120.73548
                     unlist(reduced.match(a, b))  34.23893  34.56048  36.23506
                                    pmatch(a, b)  37.57552  37.82128  38.01727
            uq        max neval
     986.17827 1061.89808   100
     824.41931  854.26298   100
     121.20605  151.43524   100
      36.57896   43.33285   100
      38.21910   40.87238   100
    
    
    
    # length(a)==400 & length(b)==10500
    ite <- 400
    lg <- 10000
    b <- b0[1:lg]
    a <- (a0[1:lg])[sample(seq(lg), ite)]
    
    microbenchmark(as.vector(sapply(a, grep, b)),                 # original
                   as.vector(sapply(paste0("^", a), grep, b)),  # @flodel 1
                   as.vector(sapply(a, grep, b, fixed = TRUE)), # Sven Hohenstein
                   unlist(reduced.match(a, b)), # @ flodel 2
    #~               f3(a, b), @Martin Morgan
                   pmatch(a, b))
    
    Unit: milliseconds
                                            expr        min         lq     median
                   as.vector(sapply(a, grep, b)) 1977.69564 2003.73443 2028.72239
      as.vector(sapply(paste0("^", a), grep, b)) 1637.46903 1659.96661 1677.21706
     as.vector(sapply(a, grep, b, fixed = TRUE))  236.81745  238.62842  239.67875
                     unlist(reduced.match(a, b))   57.18344   59.09308   59.48678
                                    pmatch(a, b)   75.03812   75.40420   75.60641
             uq        max neval
     2076.45628 2223.94624   100
     1708.86306 1905.16534   100
      241.12830  283.23043   100
       59.76167   88.71846   100
       75.99034   90.62689   100