如何计算字符串中某些字符的频率?

时间:2015-06-24 09:55:05

标签: regex r string

如果我有一系列字符,例如"AABBABBBAAAABBAAAABBBAABBBBABABB"

有没有办法让R统计A的连续段(run),并给出每种长度的连续段各有多少个?

所以我想知道连续3个A的实例数,单个A的实例数,2个A的实例数等等。

4 个答案:

答案 0 :(得分:10)

尝试

 # Collapse every stretch of non-"A" characters into a single comma, then let
 # scan() split what is left (the runs of "A") on those commas.
 comma_separated <- gsub('[^A]+', ',', str1)
 a_runs <- scan(text = comma_separated, what = '', sep = ',', quiet = TRUE)
 # Keep only the non-empty fields before tabulating the runs.
 table(a_runs[nzchar(a_runs)])
 # A   AA AAAA 
 # 3    2    2 

或者

 library(stringi)
 # Extract every maximal run of "A". The result is a list with one element per
 # input string, so the first element holds the runs for str1.
 a_run_matches <- stri_extract_all_regex(str1, '[A]+')
 table(a_run_matches[[1]])
 # A   AA AAAA 
 # 3    2    2 

基准

 # Benchmark of the approaches from the answers on one large random string.
 # Assumes stringi is already attached (stri_rand_strings and the other stri_*
 # functions are used without a library() call in this snippet).
 # NOTE(review): the timed expressions are not strictly equivalent -- the rle()
 # variant cross-tabulates runs of every letter, and the gregexpr() variant
 # tabulates run lengths rather than the run strings themselves.

 # Fix the RNG seed so the generated string is reproducible.
 set.seed(42)
 # One random string of 1e7 characters drawn from the letters A-G.
 x1 <- stri_rand_strings(1,1e7, pattern='[A-G]')

 # stringi: split on non-"A" stretches, discarding empty pieces.
 system.time(table(stri_split_regex(x1, "[^A]+", omit_empty = TRUE)))
 #   user  system elapsed 
 #  0.829   0.002   0.831 

 # stringi: extract the runs of "A" directly.
 system.time(table(stri_extract_all_regex(x1, '[A]+')[[1]]))
 #   user  system elapsed 
 #   0.790   0.002   0.791 

 # base R: split into single characters and run-length encode them.
 system.time(table(rle(strsplit(x1,"")[[1]])) )
 #   user  system elapsed 
 #  30.230   1.243  31.523 

 # base R: split on non-"A" stretches with strsplit().
 system.time(table(strsplit(x1, "[^A]+")))
 # user  system elapsed 
 # 4.253   0.006   4.258 


 # base R: tabulate the match lengths that gregexpr() stores as an attribute.
 system.time(table(attr(gregexpr("A+",x1)[[1]], 'match.length')))
 #  user  system elapsed 
 #  1.994   0.004   1.999 


 # Repeat the comparison with microbenchmark: 20 evaluations per expression,
 # reported relative to one another (unit='relative').
 library(microbenchmark)
 microbenchmark(david=table(stri_split_regex(x1, "[^A]+", omit_empty = TRUE)),
    akrun=  table(stri_extract_all_regex(x1, '[A]+')[[1]]),
    david2 =  table(strsplit(x1, "[^A]+")),
    glen = table(rle(strsplit(x1,"")[[1]])),
    plannapus = table(attr(gregexpr("A+",x1)[[1]], 'match.length')),
         times=20L, unit='relative')

#Unit: relative
#     expr       min        lq      mean    median         uq       max    neval  cld
#   david  1.0000000  1.000000  1.000000  1.000000  1.0000000  1.000000    20       a  
#   akrun  0.7908313  1.023388  1.054670  1.336510  0.9903384  1.004711    20       a
#  david2  4.9325256  5.461389  5.613516  6.207990  5.6647301  5.374668    20       c 
#    glen 14.9064240 15.975846 16.672339 20.570874 15.8710402 15.465140    20       d
#plannapus 2.5077719  3.123360  2.836338  3.557242  2.5689176  2.452964    20       b 

数据

 # Sample input used by the snippets that reference `str1`.
 str1 <- "AABBABBBAAAABBAAAABBBAABBBBABABB"

答案 1 :(得分:10)

# Split the string into single characters, run-length encode them, and
# cross-tabulate run length against the character each run consists of.
chars <- strsplit("AABBABBBAAAABBAAAABBBAABBBBABABB", "")[[1]]
char_runs <- rle(chars)
table(char_runs)

给出

       values
lengths A B
      1 3 1
      2 2 3
      3 0 2
      4 2 1

(沿A列向下读)表示:长度为1的A连续段有3个,长度为2的有2个,长度为4的有2个。

答案 2 :(得分:8)

使用strsplit的另一种方法:
x <- "AABBABBBAAAABBAAAABBBAABBBBABABB"
# Split on every stretch of non-"A" characters; the pieces are the runs of "A".
a_runs <- strsplit(x, "[^A]+")[[1]]
# strsplit() returns a leading "" when the input starts with a non-"A"
# character; drop empty pieces so they are not counted as a run.
table(a_runs[nzchar(a_runs)])
# A   AA AAAA 
# 3    2    2 

或者类似地,使用stringi:

library(stringi)
# Split on stretches of non-"A" characters, discarding empty pieces, and
# tabulate the remaining runs of "A".
a_run_pieces <- stri_split_regex(x, "[^A]+", omit_empty = TRUE)
table(a_run_pieces[[1]])

答案 3 :(得分:3)

为了完整起见,这是另一种方法:组合使用regmatches和gregexpr来提取正则表达式匹配到的子串:

x <- "AABBABBBAAAABBAAAABBBAABBBBABABB"
# Locate every run of "A" with gregexpr(), then let regmatches() pull out the
# matched substrings so they can be tabulated.
a_run_positions <- gregexpr("A+", x)
a_run_matches <- regmatches(x, a_run_positions)
table(a_run_matches[[1]])
#   A   AA AAAA 
#   3    2    2

或者事实上,由于gregexpr将捕获的子字符串的长度保持为属性,甚至可以直接执行:

# gregexpr() records each match's length in the "match.length" attribute, so
# the run lengths can be tabulated without extracting the substrings.
first_match_info <- gregexpr("A+", x)[[1]]
table(attr(first_match_info, 'match.length'))
# 1 2 4 
# 3 2 2