如何从多个字符串中取值

时间:2017-12-04 17:47:03

标签: r string

我有以下数据

# Example input: a one-column tibble-classed data frame. Each element of X__1
# holds one or more "ID [start-end]" entries separated by "; ".
df <- structure(
  list(
    X__1 = c(
      "Q6NVC0 [197-213]",
      "A0A0A8UAS1 [48-64]",
      "A0A0F0XK65 [8-17]",
      "A0A0G2JMX7 [481-491]; P20406-6 [106-116]; Q6UIN5 [7-17]; P20406-3 [70-80]; P20406-8 [164-174]",
      "A0A0G2JMX7 [481-497]; P20406-6 [106-122]; Q6UIN5 [7-23]; P20406-3 [70-86]; P20406-8 [164-180]",
      "P20406-6 [106-132]; Q6UIN5 [7-33]; P20406-3 [70-96]; P20406-8 [164-190]",
      "H6VRG2 [603-616]",
      "P13645 [41-59]",
      "P35527 [488-513]",
      "P35908 [525-544]; H2R1Z0 [512-531]",
      "H6VRG2 [550-588]",
      "A0A024RBS2 [150-162]",
      "A0A023L3M5 [83-110]",
      "P10809 [143-156]",
      "V9HW22 [470-493]",
      "P20406-8 [379-392]",
      "P19338 [524-537]",
      "A0A024L4P7 [46-57]",
      "G7YYH5 [45-55]",
      "Q86YZ3 [973-991]; [1443-1461]; [1913-1931]; [2383-2401]",
      "P35527 [375-390]",
      "P35527 [375-391]",
      "D2TTS3 [41-50]"
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -23L)
)

我想得到如下所示的输出。首先,我尝试获取两个ID(P20406-8 和 A0A024RBS2)旁边的值。如果这两个ID都不存在,就取其他ID的前两个值(即第一个方括号中的两个数)。如果两个ID同时存在,则取第一个ID旁边的值;如果只存在其中一个ID,就取该ID旁边的值。

所以最后我想要得到如下所示的两列值:

197 213
48  64
8   17
164 174
164 180
164 190
603 616
41  59
488 513
525 544
550 588
150 162
83  110
143 156
470 493
379 392
524 537
46  57
45  55
973 991
375 390
375 391
41  50

2 个答案:

答案 0 :(得分:1)

这段代码在我这里运行正常:

# Build a two-column character matrix (start, end) with one row per element of
# df$X__1. For each string the range following primary_ID is preferred, then
# the range following secondary_ID; otherwise the first bracketed range is used.
df_new <- df$X__1
primary_ID <- "P20406-8"
secondary_ID <- "A0A024RBS2"

# fixed = TRUE: the IDs are literal text, not regular expressions
ID_Primary <- grepl(pattern = primary_ID, df_new, fixed = TRUE)
ID_Secondary <- grepl(pattern = secondary_ID, df_new, fixed = TRUE)
# the secondary ID only applies to rows that do not contain the primary ID
secondary_only <- ID_Secondary & !ID_Primary

# Drop everything before the preferred ID so that the first "[...]" left in
# the string is the range belonging to that ID.
df_new[ID_Primary] <- substring(
  df_new[ID_Primary],
  first = regexpr(primary_ID, df_new[ID_Primary], fixed = TRUE)
)

# Use the same mask on both sides of the assignment so the rows stay aligned.
df_new[secondary_only] <- substring(
  df_new[secondary_only],
  first = regexpr(secondary_ID, df_new[secondary_only], fixed = TRUE)
)

start <- regexpr(pattern = "\\[", text = df_new)  # find opening square bracket
end <- regexpr(pattern = "\\]", text = df_new)  # find closing square bracket
temp <- substr(df_new, start + 1, end - 1)  # text between the first [ and ]
temp <- strsplit(temp, "-")  # split "start-end" into its two values

# one row per input string, columns = start and end (as character)
result <- do.call(rbind, temp)
rownames(result) <- NULL

我希望这有帮助!

答案 1 :(得分:1)

一种方法:

library(splitstackshape)
# separate multi observation lines into multiple columns
df2 <- cSplit(df, splitCols = "X__1", sep = ";")
# put in a long format for easier processing
# NOTE(review): melt() is not defined in this snippet; presumably it comes from
# data.table or reshape2 being attached -- confirm before running.
df3 <- na.omit(
  data.frame(melt(as.data.frame(df2), measure.vars = names(df2)))
)

# extractions
# One pattern for "<id> [<begin>-<end>]": group 1 = id, 2 = begin, 3 = end.
# Values that do not match the pattern are returned unchanged by gsub().
entry_pattern <- "^(.*) [[:punct:]](.*)-(.*)[[:punct:]]$"
# account id
df3$id <- gsub(pattern = entry_pattern, replacement = "\\1", x = df3$value)
# start range
df3$begin <- gsub(pattern = entry_pattern, replacement = "\\2", x = df3$value)
# end range
df3$end <- gsub(pattern = entry_pattern, replacement = "\\3", x = df3$value)
# position of the entry within its original line, taken from the numeric
# suffix of the split column name (X__1_1, X__1_2, ...)
df3$mult_id <- as.numeric(
  gsub(pattern = "^X__1_(.*)$", replacement = "\\1", x = df3$variable)
)


# keep only the first entry of every line
# (filter on df3 itself; no other long-format table is built in this snippet)
df.final <- df3[which(df3$mult_id == 1), ]