删除数据框各行中的特殊字符

时间:2015-11-19 21:48:40

标签: r dataframe

我有以下数据框(sour):

retrieveDeviceNode()

所需的结果如下:

"sequence" "support"
"1" "<{\"OV188\"}>" 0.628465804066543
"2" "<{\"OV191\"}>" 0.584103512014787
"3" "<{\"OV194\"}>" 0.584103512014787
"4" "<{\"OV195\"}>" 0.680221811460259
"5" "<{\"OV197\"}>" 0.584103512014787
"6" "<{\"OV200\"}>" 0.56007393715342
"7" "<{\"OV188\"},{\"OV200\"}>" 0.56007393715342
"8" "<{\"OV191\"},{\"OV197\"}>" 0.584103512014787
"9" "<{\"OV194\"},{\"OV197\"}>" 0.584103512014787
"10" "<{\"OV195\"},{\"OV197\"}>" 0.584103512014787
"11" "<{\"OV194\"},{\"OV195\"},{\"OV197\"}>" 0.584103512014787
"12" "<{\"OV191\"},{\"OV195\"},{\"OV197\"}>" 0.584103512014787
"13" "<{\"OV191\"},{\"OV194\"},{\"OV195\"},{\"OV197\"}>" 0.584103512014787
"14" "<{\"OV191\"},{\"OV194\"},{\"OV197\"}>" 0.584103512014787
"15" "<{\"OV191\"},{\"OV195\"}>" 0.584103512014787
"16" "<{\"OV194\"},{\"OV195\"}>" 0.584103512014787
"17" "<{\"OV191\"},{\"OV194\"},{\"OV195\"}>" 0.584103512014787
"18" "<{\"OV191\"},{\"OV194\"}>" 0.584103512014787

我使用以下代码:

"sequence" "support"
"1" "<{OV188}>" 0.628465804066543
"2" "<{OV191}>" 0.584103512014787
"3" "<{OV194}>" 0.584103512014787
"4" "<{OV195}>" 0.680221811460259
"5" "<{OV197}>" 0.584103512014787
"6" "<{OV200}>" 0.56007393715342
"7" "<{OV188},{OV200}>" 0.56007393715342
"8" "<{OV191},{OV197}>" 0.584103512014787
"9" "<{OV194},{OV197}>" 0.584103512014787
"10" "<{OV195},{OV197}>" 0.584103512014787
"11" "<{OV194},{OV195},{OV197}>" 0.584103512014787
"12" "<{OV191},{OV195},{OV197}>" 0.584103512014787
"13" "<{OV191},{OV194},{OV195},{OV197}>" 0.584103512014787
"14" "<{OV191},{OV194},{OV197}>" 0.584103512014787
"15" "<{OV191},{OV195}>" 0.584103512014787
"16" "<{OV194},{OV195}>" 0.584103512014787
"17" "<{OV191},{OV194},{OV195}>" 0.584103512014787
"18" "<{OV191},{OV194}>" 0.584103512014787

但是,如果此处显示的 sequence 列中有多个值(从第7行开始),我仍然会得到不需要的字符:

# Asker's attempt: work on a copy of the data frame `sour`.
a<-sour
# Rewrite column names of the form "X.<name>." to just "<name>"
# (presumably undoing name mangling from reading quoted headers -- TODO confirm).
names(a) <- sub("X\\.(.*)\\.", "\\1",  names(a))
# sub() replaces only the first match in each string, and this pattern matches
# just the quote right after "<{" and the quote right before "}>". The greedy
# (.*) captures everything in between unchanged, so in multi-item values the
# quotes inside `"},{"` are kept.
a$sequence <- sub('<\\{"(.*)"\\}>', "<{\\1}>", a$sequence)
# Store the modified copy back under the original name.
sour<-a

我需要如何更改代码,才能让第7行到第18行像第1-6行一样不含引号和反斜杠?

"sequence" "support"
"1" "<{OV188}>" 0.628465804066543
"2" "<{OV191}>" 0.584103512014787
"3" "<{OV194}>" 0.584103512014787
"4" "<{OV195}>" 0.680221811460259
"5" "<{OV197}>" 0.584103512014787
"6" "<{OV200}>" 0.56007393715342
"7" "<{OV188\"},{\"OV200}>" 0.56007393715342
"8" "<{OV191\"},{\"OV197}>" 0.584103512014787
"9" "<{OV194\"},{\"OV197}>" 0.584103512014787
"10" "<{OV195\"},{\"OV197}>" 0.584103512014787
"11" "<{OV194\"},{\"OV195\"},{\"OV197}>" 0.584103512014787
"12" "<{OV191\"},{\"OV195\"},{\"OV197}>" 0.584103512014787
"13" "<{OV191\"},{\"OV194\"},{\"OV195\"},{\"OV197}>" 0.584103512014787
"14" "<{OV191\"},{\"OV194\"},{\"OV197}>" 0.584103512014787
"15" "<{OV191\"},{\"OV195}>" 0.584103512014787
"16" "<{OV194\"},{\"OV195}>" 0.584103512014787
"17" "<{OV191\"},{\"OV194\"},{\"OV195}>" 0.584103512014787
"18" "<{OV191\"},{\"OV194}>" 0.584103512014787

1 个答案:

答案 0 :(得分:1)

怎么样

library(dplyr)
library(stringi)

# Remove every double-quote character from `sequence`, not only the outermost
# pair, so multi-item values such as <{"OV188"},{"OV200"}> are cleaned too.
# The result is deliberately not passed through shQuote(): that would wrap each
# value in literal quote characters again, while print()/write.table() already
# add the surrounding display quotes on their own.
sour %>%
  mutate(sequence = stri_replace_all_fixed(sequence, '"', ""))

我还建议将序列展开(unnest),使每个元素单独占一行。

library(tidyr)

# Split each sequence on commas into a list-column, expand it so that every
# item gets its own row, then keep only the numeric part of each item
# (e.g. `<{"OV188"}` becomes 188).
sour %>%
  mutate(sequence = stri_split_fixed(sequence, ",")) %>%
  unnest(sequence) %>%
  # Base-R equivalent of the deprecated tidyr::extract_numeric(): drop every
  # character that is not a digit, "." or "-" and convert what is left.
  mutate(sequence = as.numeric(gsub("[^0-9.-]+", "", sequence)))