查找字符串中重复字符的位置信息

时间:2014-04-29 09:45:15

标签: string r text-mining

我有一个DNA序列,我试图通过可视化n(通用核苷酸)的存在和频率来确定组装的质量。我可以通过将其放入特定格式并将其加载到其他软件中来实现。

输出为tsv / csv,其中包含以下列名称:startposition endposition value(end-start)

我的第一个想法是从字符串中创建一个字符向量,然后循环记录n块的开始和停止位置。它有效,但我几乎可以肯定有更好更简单的方法。我对正则表达式不太满意,但愿意在一些帮助下使用它。

stringa<-"attaaatagccgcggaagacctttttcatatgatagaatccccaacacannnnnnacgtagacaeagagaagattttcccccggagagcgcgaatagannnnnnnnnccataatatttataaattaatttat"
stringvector<-strsplit(stringa, NULL)[[1]]

newdf<-data.frame(start=rep(NA, 10), end=rep(NA, 10), length=rep(NA, 10))
counter=1

for (i in 1:length(stringvector)){
    if (stringvector[i]=="n"){
        if (is.na(newdf[counter,][1])){
                #record start position
                newdf[counter,][1]<-i
        } else {
            #do nothing you found the start already
        }
    }
    if (stringvector[i]!="n" & !is.na(newdf[counter,][1]) & is.na(newdf[counter,][2])){
        newdf[counter,][2]<-i-1
        newdf[counter,][3]<-newdf[counter,][2]-newdf[counter,][1]
        counter=counter+1
    }
}

序列长度不应大于约5000万个字符,我不需要为许多序列执行此操作,但希望更优雅。

有什么想法吗?

3 个答案:

答案 0 :(得分:4)

不确定是否更优雅,但如果使用gregexpr,肯定会更快。

result <- gregexpr(pattern="n+", c(stringa))[[1]]
new.df <- data.frame(start=result, length=attr(result, "match.length"))

请注意,如果stringa是向量,此解决方案也只需要稍作修改即可。在这种情况下,gregexpr将返回需要解压缩的列表。它可以在没有循环的情况下完成,具有lapplydo.call(rbind,...)

的组合

答案 1 :(得分:4)

使用stringi包的另一种可能性:

library(stringi)
m <- stri_locate_all(str = "annaannn", regex = "n+")[[1]]
length <- m[ , "end"] - m[ , "start"] + 1
cbind(m, length)
#      start end length
# [1,]     2   3      2
# [2,]     6   8      3

修改:添加基准。 stri_locate_all似乎最快。请注意,我在@ ilir的答案中添加了一个“结束”变量,使其与其他两个替代品更具可比性。

fun_stri <- function(x){
  m <- stri_locate_all(str = x, regex = "n+")[[1]]
  length <- m[ , "end"] - m[ , "start"] + 1
  cbind(m, length)
}

fun_greg <- function(x){
  result <- gregexpr(pattern="n+", c(x))[[1]]
  data.frame(start = result,
             end = result + attr(result, "match.length") - 1,
             length = attr(result, "match.length"))
}

fun_rle <- function(stringa, char = "n") { 
  tmp <- rle(strsplit(stringa, NULL)[[1]])
  start <- sapply(which(tmp$values == char)-1, function(x) sum(tmp$length[1:x]))+1 
  length <- tmp$length[tmp$values == char]
  return(data.frame(start, end = start + length, length))
}

# just check results on short strings
fun_stri("annaannn")
fun_greg("annaannn")
fun_rle("annaannn")

fun_stri(stringa)
fun_greg(stringa)
fun_rle(stringa) 

library(microbenchmark)

# size = 1e4
x <- paste(sample(c("a", "t", "c", "g", "n"), size = 1e4, replace = TRUE), collapse = "")
microbenchmark(
  fun_stri(x),
  fun_greg(x),
  fun_rle(x),
  times = 10)

# Unit: microseconds
#        expr       min        lq     median        uq       max neval
# fun_stri(x)   535.221   574.753   632.9140   684.611   711.980    10
# fun_greg(x)   709.699   748.473   776.9815   790.286   913.068    10
#  fun_rle(x) 47588.602 48281.955 50071.7875 67811.410 87781.053    10

# size = 1e5 
x <- paste(sample(c("a", "t", "c", "g", "n"), 1e5, replace = TRUE), collapse = "")
microbenchmark(
  fun_stri(x),
  fun_greg(x),
  fun_rle(x),
  times = 10)

# Unit: milliseconds
#        expr         min          lq      median          uq         max neval
# fun_stri(x)    2.871487    2.963478    3.011184    3.202578    3.272142    10
# fun_greg(x)    4.842070    4.914295    5.013888    5.368927    5.490949    10
#  fun_rle(x) 3853.887170 3868.795788 3889.699217 4228.450830 4411.025536    10

# size = 5e7 ("about 50 million characters" in OP)
x <- paste(sample(c("a", "t", "c", "g", "n"), size = 5e7, replace = TRUE), collapse = "")
microbenchmark(
  fun_stri(x),
  fun_greg(x),
  times = 10)

# Unit: seconds
#        expr      min      lq   median       uq      max neval
# fun_stri(x) 1.479089 1.54405 1.606676 1.757292 1.974795    10
# fun_greg(x) 2.381448 2.39754 2.422554 2.476974 2.643259    10

答案 2 :(得分:2)

这样的东西?

rle_ <- function(stringa, char = "n") { 
  tmp <- rle(strsplit(stringa, NULL)[[1]])
  start <- sapply(which(tmp$values == char)-1, function(x) sum(tmp$length[1:x]))+1 
  length <- tmp$length[tmp$values == char]
  return(data.frame(start, end = start + length, length))
}
rle_(stringa)
# start end length
# 1    50  56      6
# 2    99 108      9