正则表达式匹配和替换R

时间:2015-01-07 04:40:42

标签: regex r gsub

我正在尝试编写一个正则表达式,把这个矩阵中的每个元素替换为第一个冒号之后的那两个数字(即逗号前后各一个数字)。另外,我想把“./.:.:.:.:.”替换为“0,0”。

head(data)    
          Offspring-95_CAATCG         Offspring-96_AAACGG           Offspring-97_ACTCTT          
    [1,] "./.:1,7:8:18:262,0,18"     "0/1:18,4:21:56:56,0,591"     "0/0:27,0:27:78:0,78,723"    
    [2,] "0/0:49,0:49:99:0,147,1891" "0/0:107,0:107:99:0,319,4185" "1/1:0,22:22:66:902,66,0"    
    [3,] "0/0:42,0:42:99:0,126,1324"   "./.:.:.:.:."               "0/1:35,88:117:99:3152,0,718"

我试过了:

try <- gsub("\\:[0-9]*\\,[0-9]*\\:", \\1, data) 

所需的输出是:

    Offspring-95_CAATCG         Offspring-96_AAACGG           Offspring-97_ACTCTT 
[1,]    "1,7"                         "18,4"                         "27,0"    
[2,]    "49,0"                       "107,0"                         "0,22" 
[3,]    "42,0"                        "0,0"                         "35,88"   

谢谢,

3 个答案:

答案 0 :(得分:2)

这可以通过 sub 来完成:
 # Skip the leading field and its colon, capture the second colon-delimited
 # field, and discard the remainder of each string.
 sub(pattern = '[^:]+:([^:]+).*', replacement = '\\1', x = data)
 #   Offspring.95_CAATCG Offspring.96_AAACGG Offspring.97_ACTCTT
 #[1,] "1,7"               "18,4"              "27,0"             
 #[2,] "49,0"              "107,0"             "0,22"             
 #[3,] "9,4"               "33,13"             "13,0"          

可视化

  [^:]+:([^:]+).*

Regular expression visualization

Debuggex Demo

或使用 base R 中的 regmatches:
  # Replace every cell with the first "digits,digits" pair that directly
  # follows a colon. regmatches() only returns the elements that matched, so
  # the results are written back by position; cells without a match become NA
  # instead of shifting (and recycling) the remaining values.
  data[] <- local({
    hits <- regexpr('(?<=:)[0-9]+,[0-9]+', data, perl = TRUE)
    matched <- !is.na(hits) & hits > 0L
    pairs <- rep(NA_character_, length(data))
    pairs[matched] <- regmatches(data, hits)
    pairs
  })

可视化

(?<=:)[0-9]+,[0-9]+

Regular expression visualization

Debuggex Demo

以上正则表达式也可以与 stringr 或 stringi 一起使用(在大数据集上会更快)

library(stringr)
# Extract the first "digits,digits" pair that follows a colon from each cell.
# regex() is the supported pattern wrapper; perl() has been deprecated since
# stringr 1.0.0. The matrix shape is re-attached with `dim<-`.
`dim<-`(str_extract(data, regex('(?<=:)[0-9]+,[0-9]+')), dim(data))
 #     [,1]   [,2]    [,3]  
 #[1,] "1,7"  "18,4"  "27,0"
 #[2,] "49,0" "107,0" "0,22"
 #[3,] "9,4"  "33,13" "13,0"

或者

library(stringi)
# Extract the first "digits,digits" pair that follows a colon from each cell,
# then give the result the same dimensions as the input matrix.
structure(
  stri_extract(data, regex='(?<=:)[0-9]+,[0-9]+'),
  dim = dim(data)
)
 #     [,1]   [,2]    [,3]  
 #[1,] "1,7"  "18,4"  "27,0"
 #[2,] "49,0" "107,0" "0,22"
 #[3,] "9,4"  "33,13" "13,0"

更新

 # Step 1: keep only the second colon-delimited field of every cell.
 data1[] <- sub(pattern = '[^:]+:([^:]+).*', replacement = '\\1', x = data1)
 # Step 2: any cell that still has no comma gets the placeholder pair.
 data1 <- replace(data1, !grepl(',', data1), '0,0')
 data1
 #   Offspring.95_CAATCG Offspring.96_AAACGG Offspring.97_ACTCTT
 #[1,] "1,7"               "18,4"              "27,0"             
 #[2,] "49,0"              "107,0"             "0,22"             
 #[3,] "42,0"              "0,0"               "35,88" 

数据

 # 3 x 3 character matrix of colon-delimited strings, filled column by column.
 # Rows are unnamed; columns carry the three sample names.
 data <- matrix(
   c(
     "./.:1,7:8:18:262,0,18",
     "0/0:49,0:49:99:0,147,1891",
     "0/1:9,4:13:99:129,0,334",
     "0/1:18,4:21:56:56,0,591",
     "0/0:107,0:107:99:0,319,4185",
     "0/1:33,13:44:99:317,0,1150",
     "0/0:27,0:27:78:0,78,723",
     "1/1:0,22:22:66:902,66,0",
     "0/0:13,0:13:39:0,39,528"
   ),
   nrow = 3L,
   ncol = 3L,
   dimnames = list(
     NULL,
     c("Offspring.95_CAATCG", "Offspring.96_AAACGG", "Offspring.97_ACTCTT")
   )
 )

 # 3 x 3 character matrix of colon-delimited strings, filled column by column.
 # Unlike `data`, it contains one all-missing cell ("./.:.:.:.:.").
 data1 <- matrix(
   c(
     "./.:1,7:8:18:262,0,18",
     "0/0:49,0:49:99:0,147,1891",
     "0/0:42,0:42:99:0,126,1324",
     "0/1:18,4:21:56:56,0,591",
     "0/0:107,0:107:99:0,319,4185",
     "./.:.:.:.:.",
     "0/0:27,0:27:78:0,78,723",
     "1/1:0,22:22:66:902,66,0",
     "0/1:35,88:117:99:3152,0,718"
   ),
   nrow = 3L,
   ncol = 3L,
   dimnames = list(
     NULL,
     c("Offspring.95_CAATCG", "Offspring.96_AAACGG", "Offspring.97_ACTCTT")
   )
 )

答案 1 :(得分:1)

不是正则表达式,但可能非常快。

# For each column, split every string on ":" and keep the second field.
# vapply() guarantees exactly one character value per element (sapply() can
# silently change its return type), and fixed = TRUE splits on a literal ":"
# without going through the regex engine.
apply(data, 2, function(x) {
  vapply(strsplit(x, ":", fixed = TRUE), "[[", character(1), 2)
})

##      Offspring.95_CAATCG Offspring.96_AAACGG Offspring.97_ACTCTT
## [1,] "1,7"               "18,4"              "27,0"             
## [2,] "49,0"              "107,0"             "0,22"             
## [3,] "9,4"               "33,13"             "13,0" 

答案 2 :(得分:0)

试试这个:

# Extract the "digits,digits" pair that follows the first colon from every
# column of `data` and collect the columns in a data frame named `out`.
# lapply() over seq_len() builds the list in one pass and iterates zero times
# when `data` has no columns (seq(0) would yield c(1, 0)).
out <- lapply(seq_len(ncol(data)), function(i) {
  gsub('[^:]*:([0-9]+,[0-9]+).*', '\\1', data[, i])
})
# Keep the extracted values as character columns on every R version.
out <- as.data.frame(out, stringsAsFactors = FALSE)
# Copy labels one dimension at a time: `dimnames<-` on a data frame rejects
# NULL row names, which is what a matrix without row names carries.
if (!is.null(colnames(data))) {
  names(out) <- colnames(data)
}
if (!is.null(rownames(data))) {
  rownames(out) <- rownames(data)
}
out