我有一个巨大的文本文件,其中包含化学物质的记录。每个条目均以“ * NEWRECORD”开头,并以空白行结尾。我不知道那里有多少条记录。每个记录的行各不相同。如何将每条记录另存为单独的数据框?
以下是我的文本文件示例
imported_data <- c("*NEWRECORD
SH = diagnostic imaging
QE = DIAG IMAGE
QA = DG
QT = 1
QX = X-ray|NRW
UI = Q000000981
*NEWRECORD
RECTYPE = Q
SH = analogs & derivatives
QE = ANALOGS
QA = AA
QT = 1
*NEWRECORD
RECTYPE = Q
SH = abnormalities
QE = ABNORM
QX = agenesis|NRW
QX = anomalies|EQV
QX = aplasia|NRW
QX = atresia|NRW
QX = birth defects|NRW
QX = congenital defects|NRW
QX = defects|NRW
QX = deformities|NRW
QX = hypoplasia|NRW
UI = Q000002")
# What I expect is
# Table_1
# SH QE QA QT QX UI
# diagnostic imaging DIAG IMAGE DG 1 X-ray|NRW Q000000981
# Table_2
# RECTYPE SH QE QA QT
# Q analogs & derivatives ANALOGS AA 1
# and so on ...
答案 0 :(得分:1)
library(stringi)
library(purrr)
lines <- readLines("~/Data/so.txt")
head(lines)
# find start/end positions of all the records
starts <- which(stri_detect_fixed(lines, "*NEWRECORD"))
ends <- c(starts[-1], length(lines))
map2(starts, ends, ~{
# extract the bits to work on
rec <- stri_trim_both(lines[.x:.y])
# filter out unimportant bits
rec <- rec[!(stri_detect_regex(rec, "^$|NEWRECORD"))]
# get the field/value pairs
rec <- stri_split_regex(rec, "[[:space:]]*=[[:space:]]*", simplify = FALSE)
# make a container
out <- list()
# add to the container, increasing vector size of an element if necessary
for(r in rec) out[[ r[1] ]] <- c(out[[ r[1] ]], r[2])
# assume only QX is repeated since the OP neglected to provide that info
# also assume QX should be a list column since the OP also neglected to provide that info
if ("QX" %in% names(out)) out[["QX"]] <- list(out[["QX"]])
as_data_frame(out)
})
哪个会产生:
## [[1]]
## # A tibble: 1 x 6
## SH QE QA QT QX UI
## <chr> <chr> <chr> <chr> <list> <chr>
## 1 diagnostic imaging DIAG IMAGE DG 1 <chr [1]> Q000000981
##
## [[2]]
## # A tibble: 1 x 5
## RECTYPE SH QE QA QT
## <chr> <chr> <chr> <chr> <chr>
## 1 Q analogs & derivatives ANALOGS AA 1
##
## [[3]]
## # A tibble: 1 x 5
## RECTYPE SH QE QX UI
## <chr> <chr> <chr> <list> <chr>
## 1 Q abnormalities ABNORM <chr [9]> Q000002
我们还可以获得一个大数据框(嵌套QX
列):
(map2_df(starts, ends, ~{
rec <- stri_trim_both(lines[.x:.y])
rec <- rec[!(stri_detect_regex(rec, "^$|NEWRECORD"))]
rec <- stri_split_regex(rec, "[[:space:]]*=[[:space:]]*", simplify = FALSE)
out <- list()
for(r in rec) out[[ r[1] ]] <- c(out[[ r[1] ]], r[2])
if ("QX" %in% names(out)) out[["QX"]] <-list(out[["QX"]])
as_data_frame(out)
}) -> xdf)
## # A tibble: 3 x 7
## SH QE QA QT QX UI RECTYPE
## <chr> <chr> <chr> <chr> <list> <chr> <chr>
## 1 diagnostic imaging DIAG IMAGE DG 1 <chr [1]> Q000000… NA
## 2 analogs & derivatives ANALOGS AA 1 <NULL> NA Q
## 3 abnormalities ABNORM NA NA <chr [9]> Q000002 Q
或者采用^^并取消嵌套QX
列:
# one row per QX
mutate(xdf, QX = map(QX, ~if (is.null(.x)) NA_character_ else .x)) %>%
unnest(QX)
## # A tibble: 11 x 7
## SH QE QA QT UI RECTYPE QX
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 diagnostic imaging DIAG IMAGE DG 1 Q000000981 NA X-ray|NRW
## 2 analogs & derivatives ANALOGS AA 1 NA Q NA
## 3 abnormalities ABNORM NA NA Q000002 Q agenesis|NRW
## 4 abnormalities ABNORM NA NA Q000002 Q anomalies|EQV
## 5 abnormalities ABNORM NA NA Q000002 Q aplasia|NRW
## 6 abnormalities ABNORM NA NA Q000002 Q atresia|NRW
## 7 abnormalities ABNORM NA NA Q000002 Q birth defects|NRW
## 8 abnormalities ABNORM NA NA Q000002 Q congenital defects|NRW
## 9 abnormalities ABNORM NA NA Q000002 Q defects|NRW
## 10 abnormalities ABNORM NA NA Q000002 Q deformities|NRW
## 11 abnormalities ABNORM NA NA Q000002 Q hypoplasia|NRW
或者,返回并制作一个QX
展开的单个数据框:
map2_df(starts, ends, ~{
rec <- stri_trim_both(lines[.x:.y])
rec <- rec[!(stri_detect_regex(rec, "^$|NEWRECORD"))]
rec <- stri_split_regex(rec, "[[:space:]]*=[[:space:]]*", simplify = TRUE)
as.list(set_names(rec[,2], make.names(rec[,1], unique=TRUE)))
})
## # A tibble: 3 x 15
## SH QE QA QT QX UI RECTYPE QX.1 QX.2 QX.3 QX.4 QX.5 QX.6 QX.7 QX.8
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 diagn… DIAG … DG 1 X-ray… Q0000… NA NA NA NA NA NA NA NA NA
## 2 analo… ANALO… AA 1 NA NA Q NA NA NA NA NA NA NA NA
## 3 abnor… ABNORM NA NA agene… Q0000… Q anom… apla… atre… birt… cong… defe… defo… hypo…
答案 1 :(得分:1)
这也许是解决方案的起点:
library(tidyverse)
imported_data %>% str_split("\\*NEWRECORD") -> l
l[[1]][-1] %>%
purrr::map(
function(x) data.frame(z=str_split(x,"\n")[[1]][-1]) %>%
filter(str_detect(z,"="))
) %>%
purrr::map(
function(x) separate(x,z,c("k","v")," = ",extra="merge") %>%
mutate(k=str_replace_all(k," ",""))
)
#[[1]]
# k v
#1 SH diagnostic imaging
#2 QE DIAG IMAGE
#3 QA DG
#4 QT 1
#5 QX X-ray|NRW
#6 UI Q000000981
#[[2]]
# k v
#1 RECTYPE Q
#2 SH analogs & derivatives
#3 QE ANALOGS
#4 QA AA
#5 QT 1
#[[3]]
# k v
#1 RECTYPE Q
#2 SH abnormalities
#3 QE ABNORM
#4 QX agenesis|NRW
#5 QX anomalies|EQV
#6 QX aplasia|NRW
#7 QX atresia|NRW
#8 QX birth defects|NRW
#9 QX congenital defects|NRW
#10 QX defects|NRW
#11 QX deformities|NRW
#12 QX hypoplasia|NRW
#13 UI Q000002
要从所有这些数据中仅获取一个数据帧,可以选择以下选项:
imported_data %>%
str_split("\\*NEWRECORD") -> l
l[[1]][-1] %>%
purrr::map(function(x) data.frame(z=str_split(x,"\n")[[1]][-1]) %>%
filter(str_detect(z,"="))) %>%
purrr::map(function(x) separate(x,z,c("k","v")," = ",extra="merge") %>%
mutate(k=str_replace_all(k," ","")) %>%
group_by(k) %>%
summarise(v= paste(v,collapse=", ")) %>%
spread(k,v)
) %>% purrr::reduce(bind_rows)
## A tibble: 3 x 7
# QA QE QT QX SH UI RECTYPE
# <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 DG DIAG IMAGE 1 X-ray|NRW diagnostic imaging Q0000009~ <NA>
#2 AA ANALOGS 1 <NA> analogs & derivativ~ <NA> Q
#3 <NA> ABNORM <NA> agenesis|NRW, anomalies|EQV, aplasia|NRW, atresia|NRW, birth defects|NRW, congenital defects|NRW, defects|NRW, deformities|NRW, hypo~ abnormalities Q000002 Q
答案 2 :(得分:0)
保留上面的答案即可为其他人“展示作品”,但是您现在可以这样做:
devtools::install_github("hrbrmstr/whatamesh")
,然后执行此操作以列出站点上的文件:
list_mesh_files()
## # A tibble: 3 x 3
## `Size (Bytes)` `Last Modified` File
## <chr> <chr> <chr>
## 1 98499407 Oct 15 04:38 c2018.bin
## 2 29052512 Jul 16 04:31 d2018.bin
## 3 85614 Jul 16 04:30 q2018.bin
如果文件是本地文件,它将在本地读取,否则将下载并读取:
read_mesh_file("q2018.bin")
## # A tibble: 79 x 16
## RECTYPE SH QE QA QT MS AN HN QX DA MR DQ UI OL TN QS
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <lis> <chr> <chr> <chr> <chr> <chr> <lis> <chr>
## 1 Q diagnostic imaging DIAG… DG 1 Used… subh… 2017… <chr… 2016… 2016… 2017… Q000… <NA> <NUL… <NA>
## 2 Q abnormalities ABNO… AB 1 Used… subh… 66; … <chr… 1973… 2015… 1966… Q000… sear… <chr… <NA>
## 3 Q administration & dosage ADMIN AD 1 Used… "sub… 66; … <chr… 1973… 2017… 1966… Q000… sear… <chr… ADMI…
## 4 Q adverse effects ADV … AE 1 Used… subh… 66; … <chr… 1973… 2017… 1966… Q000… sear… <chr… <NA>
## 5 Q analogs & derivatives ANAL… AA 1 Used… subh… 75; … <chr… 1974… 2017… 1975… Q000… sear… <chr… <NA>
## 6 Q analysis ANAL AN 1 "Use… "sub… 67; … <chr… 1973… 2003… 1967… Q000… sear… <chr… <NA>
## 7 Q anatomy & histology ANAT AH 1 Used… subh… 66; … <chr… 1973… 2017… 1966… Q000… sear… <chr… ANAT…
## 8 Q antagonists & inhibitors ANTAG AI 1 Used… subh… 68; … <chr… 1973… 2017… 1968… Q000… sear… <chr… <NA>
## 9 Q biosynthesis BIOS… BI 1 Used… "sub… 66; … <chr… 1973… 2003… 1966… Q000… sear… <chr… <NA>
## 10 Q blood BLOOD BL 1 "Use… "sub… 67; … <NUL… 1973… 2003… 1967… Q000… sear… <chr… <NA>
## # ... with 69 more rows
它也支持“宽”格式:
read_mesh_file("q2018.bin", wide = TRUE)
## # A tibble: 79 x 108
## RECTYPE SH QE QA QT MS AN HN QX QX.1 QX.2 QX.3 QX.4 QX.5 QX.6 QX.7 QX.8 QX.9 QX.10
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Q diagn… DIAG… DG 1 Used… subh… 2017… X-ra… X-ra… X-ra… echo… echo… radi… radi… radi… roen… ultr… ultr…
## 2 Q abnor… ABNO… AB 1 Used… subh… 66; … agen… anom… apla… atre… birt… cong… defe… defo… hypo… malf… <NA>
## 3 Q admin… ADMIN AD 1 Used… "sub… 66; … admi… <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 4 Q adver… ADV … AE 1 Used… subh… 66; … side… <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 5 Q analo… ANAL… AA 1 Used… subh… 75; … anal… deri… <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 Q analy… ANAL AN 1 "Use… "sub… 67; … assa… chem… dete… <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 7 Q anato… ANAT AH 1 Used… subh… 66; … anat… anat… hist… morp… <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 8 Q antag… ANTAG AI 1 Used… subh… 68; … anta… anta… inhi… <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 9 Q biosy… BIOS… BI 1 Used… "sub… 66; … anab… biof… <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 10 Q blood BLOOD BL 1 "Use… "sub… 67; … <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## # ... with 69 more rows, and 89 more variables: QX.11 <chr>, DA <chr>, MR <chr>, DQ <chr>, UI <chr>, OL <chr>,
## # TN <chr>, TN.1 <chr>, TN.2 <chr>, TN.3 <chr>, TN.4 <chr>, TN.5 <chr>, TN.6 <chr>, TN.7 <chr>, TN.8 <chr>,
## # TN.9 <chr>, TN.10 <chr>, TN.11 <chr>, TN.12 <chr>, QS <chr>, TN.13 <chr>, TN.14 <chr>, TN.15 <chr>, TN.16 <chr>,
## # TN.17 <chr>, TN.18 <chr>, TN.19 <chr>, TN.20 <chr>, TN.21 <chr>, TN.22 <chr>, TN.23 <chr>, TN.24 <chr>,
## # TN.25 <chr>, TN.26 <chr>, TN.27 <chr>, TN.28 <chr>, TN.29 <chr>, TN.30 <chr>, TN.31 <chr>, TN.32 <chr>,
## # TN.33 <chr>, TN.34 <chr>, TN.35 <chr>, TN.36 <chr>, TN.37 <chr>, TN.38 <chr>, TN.39 <chr>, TN.40 <chr>,
## # TN.41 <chr>, TN.42 <chr>, TN.43 <chr>, TN.44 <chr>, TN.45 <chr>, TN.46 <chr>, TN.47 <chr>, TN.48 <chr>,
## # TN.49 <chr>, TN.50 <chr>, TN.51 <chr>, TN.52 <chr>, TN.53 <chr>, TN.54 <chr>, TN.55 <chr>, TN.56 <chr>,
## # TN.57 <chr>, TN.58 <chr>, TN.59 <chr>, TN.60 <chr>, TN.61 <chr>, TN.62 <chr>, TN.63 <chr>, TN.64 <chr>,
## # TN.65 <chr>, TN.66 <chr>, TN.67 <chr>, TN.68 <chr>, TN.69 <chr>, TN.70 <chr>, TN.71 <chr>, TN.72 <chr>,
## # TN.73 <chr>, TN.74 <chr>, TN.75 <chr>, TN.76 <chr>, TN.77 <chr>, TN.78 <chr>, QX.12 <chr>, QX.13 <chr>, QX.14 <chr>
而且,它也可以与其他两个大文件一起使用(d / l需要一段时间,而解析它们需要一段时间)。
它应该处理MeSH格式的所有已定义重复列。
如果有任何问题请file issues,然后在此处发布。
答案 3 :(得分:-1)
这是一种方法:
library(dplyr)
split1 <- strsplit(imported_data, "\\*NEWRECORD") %>% unlist()
split2 <- split1
if(sum (0==nchar(split1))>0) split2 <- split1[-which(0==nchar(split1))]
split3 <- strsplit(split2, "\\n")
split4 <- lapply(split3, function(x) strsplit(x, " = "))
split5 <- split4[lapply(split4,length)>0] #lapply(split4, function(x) if(sum(length(x)==0)>0) {x[-which(length(x)==0)]} else x )
lapply(split4, nchar)
length(split4[[3]][[2]])
is.null( split4[[3]][[1]])
split5 <- lapply(split4, function(x) ifelse(lapply(x, function(y) length(y) >0), x, ""))
split6 <- lapply(split5, function(x) lapply(x, function(y) gsub(" ", "", y)))
split7 <- lapply(split6, function(x) x[nzchar(x)])
lapply(split6, length)
dim1 <- length(split7)
dim2 <- lapply(split7, length) %>% unlist()
dim3 <- lapply(split7, function(x) lapply(x, length) ) %>% unlist()
listres <- list()
for( i in 1:length(dim2)) {
mat <- matrix(NA, dim2[i], 2)
mat <- data.frame(matrix(unlist(split7[[i]]), nrow= dim2[i], byrow=T))# %>% t()
dim(mat)
names(mat) <- c("varname", "value")
mat <- t(mat)
colnames(mat) <- mat[1,]
mat <- mat[-1,]
listres[[i]] <- mat
}
listres
[[1]]
SH QE QA QT
"diagnosticimaging" "DIAGIMAGE" "DG" "1"
QX UI
"X-ray|NRW" "Q000000981"
[[2]]
RECTYPE SH QE
"Q" "analogs&derivatives" "ANALOGS"
QA QT
"AA" "1"
[[3]]
RECTYPE SH QE
"Q" "abnormalities" "ABNORM"
QX QX QX
"agenesis|NRW" "anomalies|EQV" "aplasia|NRW"
QX QX QX
"atresia|NRW" "birthdefects|NRW" "congenitaldefects|NRW"
QX QX QX
"defects|NRW" "deformities|NRW" "hypoplasia|NRW"
UI
"Q000002"
您现在有了一个包含指定n个数据帧的列表。
编辑:您的帖子中的表1使用data.frame(listres[[1]]) %>% t()
SH QE QA QT
"diagnosticimaging" "DIAGIMAGE" "DG" "1"
QX UI
"X-ray|NRW" "Q000000981"
其他数据帧依此类推。