我有一个相当可怕的HTML格式的文本文件:
A<b>Metabolism</b>
B
B <b>Overview</b>
C 01200 Carbon metabolism [PATH:bpe01200]
D BP3142 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1971 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1519 fba; fructose-1,6-bisphosphate aldolase K01624 FBA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
D BP0801 tpiA; triosephosphate isomerase K01803 TPI; triosephosphate isomerase (TIM) [EC:5.3.1.1]
D BP1000 gap; glyceraldehyde-3-phosphate dehydrogenase K00134 GAPDH; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
我想在 R 中将此文件解析为如下所示的列:
A,Metabolism
B,
B,Overview
C,01200,Carbon metabolism,Path,bpe01200
D,BP3142,Pgi,glucose-6-phosphate isomerase,GPI,glucose-6-phosphate isomerase,[EC:5.3.1.9]
...
D,BP1000,gap,glyceraldehyde-3-phosphate dehydrogenase,K00134,GAPDH,glyceraldehyde 3-phosphate dehydrogenase,[EC:1.2.1.12]
问题是分隔符在一行的不同部分各不相同。它似乎遵循如下模式,例如:
D BP1971 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
^Tab ^space^Semi colon ^tab ^space^semi colon
我能想到一些不太巧妙的方法,例如每次只按一个分隔符解析。但有没有人有更巧妙的解决方案?或者知道某个能很好地处理这种格式的工具?
我真的很感激一些帮助:)
谢谢
答案 0 :(得分:2)
library(stringr)
library(purrr)

# Sample input. The "\t" escapes stand for the tab separators of the real file.
# The last line deliberately matches none of the patterns below.
file <- "A<b>Metabolism</b>
B
B <b>Overview</b>
C\t01200 Carbon metabolism [PATH:bpe01200]
D\tBP3142 pgi; glucose-6-phosphate isomerase\tK01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D\tBP1971 pgi; glucose-6-phosphate isomerase\tK01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D\tBP1519 fba; fructose-1,6-bisphosphate aldolase\tK01624 FBA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
D\tBP0801 tpiA; triosephosphate isomerase\tK01803 TPI; triosephosphate isomerase (TIM) [EC:5.3.1.1]
D\tBP1000 gap; glyceraldehyde-3-phosphate dehydrogenase\tK00134 GAPDH; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
This line is to check behavior when parsing fails."
cat(file)

# Split the sample into one string per line. The text connection is closed
# explicitly instead of being left open until the garbage collector finds it.
con <- textConnection(file)
data <- readLines(con)
close(con)

# Pattern to capture "A<b>Metabolism</b>" for instance
pattern_1 <- "^(\\w+)\\h*<b>\\h*(\\w+)\\h*</b>\\h*$"
# Pattern to capture "B" for instance
pattern_2 <- "^(\\w+)$"
# Pattern to capture "C\t01200 Carbon metabolism [PATH:bpe01200]" for instance
pattern_3 <- "^(\\w+)\\t+(\\w+)\\s+([^\\[\\t;]*)\\h*(\\[[^\\]]*\\])$"
# Pattern to capture "D\tBP3142 pgi; glucose-6-phosphate isomerase\tK01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]" for instance
pattern_4 <- "^(\\w+)\\t+(\\w+)\\s+(\\w+);\\h*([^\\t]*)\\t+(\\w+)\\s+(\\w+);\\h*([^\\[]*)\\h*(\\[[^\\]]*\\])$"
# Some more explanations:
# Parens wrap groups to extract
# "\\w+" matches words
# "\\t+", "\\s+" or ";\\h*" are specific separators of OP's original data
# "([^\\t]*)" matches anything until the next tab separator
# Convoluted patterns such as "(\\[[^\\]]*\\])" extract a bracketed token,
# brackets included

# Named list of the patterns, in the order in which they are tried.
patterns <- list(
  pattern_1 = pattern_1,
  pattern_2 = pattern_2,
  pattern_3 = pattern_3,
  pattern_4 = pattern_4
)

# A list of the data parsed 4 times, once for each pattern. Each element is a
# character matrix: a "match" column followed by the captured groups.
map(patterns, function(pattern) {
  extraction <- str_match(data, pattern)
  cbind(match = !is.na(extraction[, 1]), extraction[, -1])
})

# This is closer to the desired output: a list of [un]parsed rows.
parsed_rows <- map(data, function(row) {
  # Find the first pattern that matches; 0 if none does. str_detect() is used
  # so that detection and extraction run on the same regex engine.
  pattern_index <- detect_index(
    patterns,
    function(pattern) str_detect(row, pattern)
  )
  if (pattern_index == 0L) {
    # Failed to parse: return the original row as a length-1 character vector
    row
  } else {
    # Parsed: drop the full match and keep the captured groups only
    str_match(row, patterns[[pattern_index]])[-1]
  }
})
parsed_rows
输出的前几项如下所示:
list(c("A", "Metabolism"), "B", c("B", "Overview"), c("C", "01200",
"Carbon metabolism ", "[PATH:bpe01200]"), c("D", "BP3142", "pgi",
"glucose-6-phosphate isomerase", "K01810", "GPI", "glucose-6-phosphate isomerase ",
"[EC:5.3.1.9]"))
答案 1 :(得分:1)
text <- "
A<b>Metabolism</b>
B
B <b>Overview</b>
C 01200 Carbon metabolism [PATH:bpe01200]
D BP3142 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1971 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1519 fba; fructose-1,6-bisphosphate aldolase K01624 FBA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
D BP0801 tpiA; triosephosphate isomerase K01803 TPI; triosephosphate isomerase (TIM) [EC:5.3.1.1]
D BP1000 gap; glyceraldehyde-3-phosphate dehydrogenase K00134 GAPDH; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
"
library(stringr)

# Line-anchored matchers. Anchoring with ^ and $ in multiline mode keeps a
# "C" or "D" in the middle of a line from being mistaken for a record marker,
# and no longer requires the last line to end with a newline.
header_regex <- regex("^C\\s+(.+)$", multiline = TRUE)
detail_regex <- regex("^D\\s+(.+)$", multiline = TRUE)

# get the header items (first line beginning with "C" + whitespace)
headers <- str_match(text, header_regex)[, 2]
# split the header into its number, its title and its bracketed tag
header_items <- trimws(
  str_match(headers, "(\\d+)\\s+([^\\[]+)(.+)")[1, 2:4]
)

# get the detail items (lines beginning with "D" + whitespace)
details <- str_match_all(text, detail_regex)[[1]][, 2]

# parse each item within detail
# split on ";" + whitespace into exactly three columns (V1, V2, V3); any extra
# "; " stays inside the third piece instead of breaking the table shape
items <- as.data.frame(
  str_split_fixed(details, ";\\s", n = 3),
  stringsAsFactors = FALSE
)

# parse each part using pattern matches
# capture () beginning of string ^ and all characters not whitespace [^\\s]+
items$V1A <- str_match(items$V1, "(^[^\\s]+)")[, 2]
# capture () end of string $ and a non-whitespace sequence [^\\s]+
items$V1B <- str_match(items$V1, "([^\\s]+)$")[, 2]
# capture () beginning of string excluding two non-whitespace sequences
# [^\\s]+ at end $
items$V2A <- str_match(items$V2, "^(.+)\\s[^\\s]+\\s[^\\s]+$")[, 2]
# capture () non-whitespace sequence [^\\s]+ at end of string $
items$V2C <- str_match(items$V2, "([^\\s]+)$")[, 2]
# capture () second to last non-whitespace sequence [^\\s]+ at end of string $
items$V2B <- str_match(items$V2, "([^\\s]+)\\s[^\\s]+$")[, 2]
# capture () beginning of string ^ excluding last non-whitespace sequence
# [^\\s]+
items$V3A <- str_match(items$V3, "^(.+)\\s[^\\s]+$")[, 2]
# capture () non-whitespace sequence at end $
items$V3B <- str_match(items$V3, "([^\\s]+)$")[, 2]

# select & reorder
items <- items[, c("V1A", "V1B", "V2A", "V2B", "V2C", "V3A", "V3B")]
items
# V1A V1B V2A V2B V2C V3A V3B
#1 BP3142 pgi glucose-6-phosphate isomerase K01810 GPI glucose-6-phosphate isomerase [EC:5.3.1.9]
#2 BP1971 pgi glucose-6-phosphate isomerase K01810 GPI glucose-6-phosphate isomerase [EC:5.3.1.9]
#3 BP1519 fba fructose-1,6-bisphosphate aldolase K01624 FBA fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
#4 BP0801 tpiA triosephosphate isomerase K01803 TPI triosephosphate isomerase (TIM) [EC:5.3.1.1]
#5 BP1000 gap glyceraldehyde-3-phosphate dehydrogenase K00134 GAPDH glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
答案 2 :(得分:1)
一个更简单的版本:只用一个正则表达式,在一次匹配中提取所有明细字段
text <- "
A<b>Metabolism</b>
B
B <b>Overview</b>
C 01200 Carbon metabolism [PATH:bpe01200]
D BP3142 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1971 pgi; glucose-6-phosphate isomerase K01810 GPI; glucose-6-phosphate isomerase [EC:5.3.1.9]
D BP1519 fba; fructose-1,6-bisphosphate aldolase K01624 FBA; fructose-bisphosphate aldolase, class II [EC:4.1.2.13]
D BP0801 tpiA; triosephosphate isomerase K01803 TPI; triosephosphate isomerase (TIM) [EC:5.3.1.1]
D BP1000 gap; glyceraldehyde-3-phosphate dehydrogenase K00134 GAPDH; glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12]
"
library(stringr)

# Line-anchored matcher for detail records. Anchoring with ^ and $ in
# multiline mode keeps a "D" in the middle of a line from being mistaken for a
# record marker, and no longer requires the last line to end with a newline.
detail_regex <- regex("^D\\s+(.+)$", multiline = TRUE)

# get the detail items (lines beginning with "D" + whitespace)
details <- str_match_all(text, detail_regex)[[1]][, 2]
details

# One pattern for the whole record, seven captured groups in order:
# id, short name, description, second id, second short name,
# second description, trailing bracketed tag.
pattern <- "([^\\s]+)\\s([^\\s]+);(.*)\\s([^\\s]+)\\s([^\\s]+);\\s(.*)\\s([^\\s]+)$"

# Drop the full-match column; trimws() removes the space that the third group
# picks up right after the first ";".
parsed_details <- trimws(str_match(details, pattern)[, -1])
parsed_details
#[,1] [,2] [,3] [,4] [,5]
#[1,] "BP3142" "pgi" "glucose-6-phosphate isomerase" "K01810" "GPI"
#[2,] "BP1971" "pgi" "glucose-6-phosphate isomerase" "K01810" "GPI"
#[3,] "BP1519" "fba" "fructose-1,6-bisphosphate aldolase" "K01624" "FBA"
#[4,] "BP0801" "tpiA" "triosephosphate isomerase" "K01803" "TPI"
#[5,] "BP1000" "gap" "glyceraldehyde-3-phosphate dehydrogenase" "K00134" "GAPDH"
# [,6] [,7]
#[1,] "glucose-6-phosphate isomerase" "[EC:5.3.1.9]"
#[2,] "glucose-6-phosphate isomerase" "[EC:5.3.1.9]"
#[3,] "fructose-bisphosphate aldolase, class II" "[EC:4.1.2.13]"
#[4,] "triosephosphate isomerase (TIM)" "[EC:5.3.1.1]"
#[5,] "glyceraldehyde 3-phosphate dehydrogenase" "[EC:1.2.1.12]"