我想使用正则表达式捕获子字符串。我已经有了一个可行的解决方案,但想知道是否有更快的方法。
我需要将 applyCaptureRegex 应用于一个包含约 400,000 个条目的向量。
# Sample input: four bracketed coordinate strings in a one-column data frame.
exampleData <- as.data.frame(
  c(
    "[hg19:21:34809787-34809808:+]",
    "[hg19:11:105851118-105851139:+]",
    "[hg19:17:7482245-7482266:+]",
    "[hg19:6:19839915-19839936:+]"
  )
)
# Find every match of `captRegEx` in the first element of `str` and split
# each match into its capture groups.
#
# Args:
#   captRegEx: regular expression containing capture groups.
#   str:       character vector; only its first element is searched.
#
# Returns:
#   A list named by the matched substrings. Each element is a character
#   vector holding the full match followed by its capture groups.
captureRegex <- function(captRegEx, str) {
  # First pass: collect all full matches found in the first string.
  hits <- regmatches(str, gregexpr(captRegEx, str))[[1]]
  # Second pass: re-run the pattern on each match to pull out the groups.
  groups <- lapply(hits, function(hit) {
    regmatches(hit, regexec(captRegEx, hit))[[1]]
  })
  names(groups) <- hits
  groups
}
# Run captureRegex() on the first column of `mir` and collect the results in
# a character matrix.
#
# Args:
#   mir: data frame (or matrix) whose first column holds the strings to
#        search.
#   r:   regular expression containing capture groups.
#
# Returns:
#   A character matrix filled row by row from the matches: the full match
#   first, followed by one column per capture group.
applyCaptureRegex <- function(mir, r) {
  # apply() hands over one row at a time; only its first field is searched.
  per_row <- apply(mir, 1, function(x) captureRegex(r, x[1]))
  # Flatten one level so each element is the character vector of one match.
  hits <- unlist(per_row, recursive = FALSE)
  # Take the column count from the matches instead of hard-coding 5, so a
  # regex with a different number of capture groups is reshaped correctly.
  # When nothing matched, keep the previous width of 5.
  n_fields <- max(lengths(hits), 0L)
  if (n_fields == 0L) {
    n_fields <- 5L
  }
  matrix(unlist(per_row), ncol = n_fields, byrow = TRUE)
}
用法和结果:
> captureRegex("\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]","[hg19:12:125627828-125627847:-]")
$`[hg19:12:125627828-125627847:-]`
[1] "[hg19:12:125627828-125627847:-]" "12" "125627828" "125627847" "-"
> applyCaptureRegex(exampleData,"\\[[a-z0-9]+:([0-9]+):([0-9]+)-([0-9]+):([-+])\\]")
[,1] [,2] [,3] [,4] [,5]
[1,] "[hg19:21:34809787-34809808:+]" "21" "34809787" "34809808" "+"
[2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+"
[3,] "[hg19:17:7482245-7482266:+]" "17" "7482245" "7482266" "+"
[4,] "[hg19:6:19839915-19839936:+]" "6" "19839915" "19839936" "+"
谢谢!
答案 0 :(得分:8)
为什么要重新发明轮子?有多个现成的包可供选择,它们提供的函数会返回一个字符矩阵,模式中的每个捕获组各占一列。
stri_match_all_regex - stringi
# Load stringi so that stri_match_all_regex() is available in a fresh session.
library(stringi)
x <- c(
  '[hg19:21:34809787-34809808:+]',
  '[hg19:11:105851118-105851139:+]',
  '[hg19:17:7482245-7482266:+]',
  '[hg19:6:19839915-19839936:+]'
)
# stri_match_all_regex() returns one result per input string; stack them
# row-wise into a single matrix.
do.call(
  rbind,
  stri_match_all_regex(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
)
# [,1] [,2] [,3] [,4] [,5]
# [1,] "[hg19:21:34809787-34809808:+]" "21" "34809787" "34809808" "+"
# [2,] "[hg19:11:105851118-105851139:+]" "11" "105851118" "105851139" "+"
# [3,] "[hg19:17:7482245-7482266:+]" "17" "7482245" "7482266" "+"
# [4,] "[hg19:6:19839915-19839936:+]" "6" "19839915" "19839936" "+"
str_match - stringr
# Load stringr so that str_match() is available in a fresh session.
library(stringr)
str_match(x, '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]')
strapplyc - gsubfn
# Load gsubfn so that strapplyc() is available in a fresh session.
library(gsubfn)
# The whole pattern is wrapped in an extra group so the full match is
# returned alongside the inner capture groups.
strapplyc(x, "(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])", simplify = rbind)
以下是上述所有解决方案的基准测试比较。
# Benchmark input: the four sample strings repeated 1000 times (4000 entries).
x <- rep(
  c(
    '[hg19:21:34809787-34809808:+]',
    '[hg19:11:105851118-105851139:+]',
    '[hg19:17:7482245-7482266:+]',
    '[hg19:6:19839915-19839936:+]'
  ),
  times = 1000
)
# Apply a capture-group regex to each element of `mir` using base R only.
#
# Args:
#   mir: vector of strings; each element is matched separately.
#   r:   regular expression containing capture groups.
#
# Returns:
#   The per-element results of regexec()/regmatches() (full match followed by
#   the capture groups), bound together row-wise with rbind().
applyCaptureRegex <- function(mir, r) {
  # Match a single string and return its full match plus capture groups.
  extract_groups <- function(x) {
    hit <- regexec(r, x)
    regmatches(x, hit)[[1]]
  }
  rows <- lapply(mir, extract_groups)
  do.call(rbind, rows)
}
# Benchmark candidate: strapplyc() from the gsubfn package, with the
# per-string results combined via rbind.
gsubfn <- function(x1) {
  strapplyc(
    x1,
    '(\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])])',
    simplify = rbind
  )
}
# Benchmark candidate: the base R approach, delegating to applyCaptureRegex().
regmtch <- function(x1) {
  pattern <- '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'
  applyCaptureRegex(x1, pattern)
}
# Benchmark candidate: str_match() from the stringr package.
stringr <- function(x1) {
  pattern <- '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'
  str_match(x1, pattern)
}
# Benchmark candidate: stri_match_all_regex() from the stringi package, with
# the per-string results stacked row-wise.
stringi <- function(x1) {
  per_string <- stri_match_all_regex(
    x1,
    '\\[[^:]+:(\\d+):(\\d+)-(\\d+):([-+])]'
  )
  do.call(rbind, per_string)
}
# Attach the packages the benchmark candidates rely on. library() stops with
# an error when a package is missing, whereas require() only returns FALSE
# and lets the script fail later with a less helpful message.
library(gsubfn)
library(stringr)
library(stringi)
library(microbenchmark)
# Time every candidate on the same input vector `x`.
microbenchmark(gsubfn(x), regmtch(x), stringr(x), stringi(x))
结果
Unit: milliseconds
expr min lq mean median uq max neval
gsubfn(x) 372.27072 382.82179 391.21837 388.32396 396.27361 449.03091 100
regmtch(x) 394.03164 409.87523 419.42936 417.76770 427.08208 456.92460 100
stringr(x) 65.81644 70.28327 76.02298 75.43162 78.92567 116.18026 100
stringi(x) 15.88171 16.53047 17.52434 16.96127 17.76007 23.94449 100