如何在特定位置拆分.txt文件并将每个部分导入为R中的表

时间:2018-10-15 09:50:55

标签: r split

我有一个巨大的文本文件,其中包含化学物质的记录。每个条目均以“ * NEWRECORD”开头,并以空白行结尾。我不知道那里有多少条记录。每个记录的行各不相同。如何将每条记录另存为单独的数据框?

以下是我的文本文件示例

imported_data <- c("*NEWRECORD
   SH = diagnostic imaging
   QE = DIAG IMAGE
   QA = DG
   QT = 1
   QX = X-ray|NRW
   UI = Q000000981

   *NEWRECORD
   RECTYPE = Q
   SH = analogs & derivatives
   QE = ANALOGS
   QA = AA
   QT = 1

   *NEWRECORD
   RECTYPE = Q
   SH = abnormalities
   QE = ABNORM
   QX = agenesis|NRW
   QX = anomalies|EQV
   QX = aplasia|NRW
   QX = atresia|NRW
   QX = birth defects|NRW
   QX = congenital defects|NRW
   QX = defects|NRW
   QX = deformities|NRW
   QX = hypoplasia|NRW
   UI = Q000002")

# What I expect is

# Table_1
# SH                    QE         QA   QT   QX         UI
# diagnostic imaging   DIAG IMAGE  DG   1    X-ray|NRW  Q000000981

# Table_2
# RECTYPE   SH                     QE        QA     QT
# Q         analogs & derivatives  ANALOGS   AA     1

# and so on ...

4 个答案:

答案 0 :(得分:1)

library(stringi)
library(purrr)

lines <- readLines("~/Data/so.txt")

head(lines)

# find start/end positions of all the records
starts <- which(stri_detect_fixed(lines, "*NEWRECORD"))
ends <- c(starts[-1], length(lines))

map2(starts, ends, ~{

  # extract the bits to work on
  rec <- stri_trim_both(lines[.x:.y]) 

  # filter out unimportant bits
  rec <- rec[!(stri_detect_regex(rec, "^$|NEWRECORD"))]

  # get the field/value pairs  
  rec <- stri_split_regex(rec, "[[:space:]]*=[[:space:]]*", simplify = FALSE)

  # make a container
  out <- list()

  # add to the container, increasing vector size of an element if necessary
  for(r in rec) out[[ r[1] ]] <- c(out[[ r[1] ]], r[2])

  # assume only QX is repeated since the OP neglected to provide that info
  # also assume QX should be a list column since the OP also neglected to provide that info
  if ("QX" %in% names(out)) out[["QX"]] <- list(out[["QX"]])

  as_data_frame(out)

})

哪个会产生:

## [[1]]
## # A tibble: 1 x 6
##   SH                 QE         QA    QT    QX        UI        
##   <chr>              <chr>      <chr> <chr> <list>    <chr>     
## 1 diagnostic imaging DIAG IMAGE DG    1     <chr [1]> Q000000981
## 
## [[2]]
## # A tibble: 1 x 5
##   RECTYPE SH                    QE      QA    QT   
##   <chr>   <chr>                 <chr>   <chr> <chr>
## 1 Q       analogs & derivatives ANALOGS AA    1    
## 
## [[3]]
## # A tibble: 1 x 5
##   RECTYPE SH            QE     QX        UI     
##   <chr>   <chr>         <chr>  <list>    <chr>  
## 1 Q       abnormalities ABNORM <chr [9]> Q000002

我们还可以获得一个大数据框(嵌套QX列):

(map2_df(starts, ends, ~{
  rec <- stri_trim_both(lines[.x:.y]) 
  rec <- rec[!(stri_detect_regex(rec, "^$|NEWRECORD"))]
  rec <- stri_split_regex(rec, "[[:space:]]*=[[:space:]]*", simplify = FALSE)
  out <- list()
  for(r in rec) out[[ r[1] ]] <- c(out[[ r[1] ]], r[2])
  if ("QX" %in% names(out)) out[["QX"]] <-list(out[["QX"]])
  as_data_frame(out)
}) -> xdf)
## # A tibble: 3 x 7
##   SH                    QE         QA    QT    QX        UI       RECTYPE
##   <chr>                 <chr>      <chr> <chr> <list>    <chr>    <chr>  
## 1 diagnostic imaging    DIAG IMAGE DG    1     <chr [1]> Q000000… NA     
## 2 analogs & derivatives ANALOGS    AA    1     <NULL>    NA       Q      
## 3 abnormalities         ABNORM     NA    NA    <chr [9]> Q000002  Q    

或者采用^^并取消嵌套QX列:

# one row per QX
mutate(xdf, QX = map(QX, ~if (is.null(.x)) NA_character_ else .x)) %>% 
  unnest(QX)
## # A tibble: 11 x 7
##    SH                    QE         QA    QT    UI         RECTYPE QX                    
##    <chr>                 <chr>      <chr> <chr> <chr>      <chr>   <chr>                 
##  1 diagnostic imaging    DIAG IMAGE DG    1     Q000000981 NA      X-ray|NRW             
##  2 analogs & derivatives ANALOGS    AA    1     NA         Q       NA                    
##  3 abnormalities         ABNORM     NA    NA    Q000002    Q       agenesis|NRW          
##  4 abnormalities         ABNORM     NA    NA    Q000002    Q       anomalies|EQV         
##  5 abnormalities         ABNORM     NA    NA    Q000002    Q       aplasia|NRW           
##  6 abnormalities         ABNORM     NA    NA    Q000002    Q       atresia|NRW           
##  7 abnormalities         ABNORM     NA    NA    Q000002    Q       birth defects|NRW     
##  8 abnormalities         ABNORM     NA    NA    Q000002    Q       congenital defects|NRW
##  9 abnormalities         ABNORM     NA    NA    Q000002    Q       defects|NRW           
## 10 abnormalities         ABNORM     NA    NA    Q000002    Q       deformities|NRW       
## 11 abnormalities         ABNORM     NA    NA    Q000002    Q       hypoplasia|NRW    

或者,返回并制作一个QX展开的单个数据框:

map2_df(starts, ends, ~{
  rec <- stri_trim_both(lines[.x:.y]) 
  rec <- rec[!(stri_detect_regex(rec, "^$|NEWRECORD"))]
  rec <- stri_split_regex(rec, "[[:space:]]*=[[:space:]]*", simplify = TRUE)
  as.list(set_names(rec[,2], make.names(rec[,1], unique=TRUE)))
})
## # A tibble: 3 x 15
##   SH     QE     QA    QT    QX     UI     RECTYPE QX.1  QX.2  QX.3  QX.4  QX.5  QX.6  QX.7  QX.8 
##   <chr>  <chr>  <chr> <chr> <chr>  <chr>  <chr>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 diagn… DIAG … DG    1     X-ray… Q0000… NA      NA    NA    NA    NA    NA    NA    NA    NA   
## 2 analo… ANALO… AA    1     NA     NA     Q       NA    NA    NA    NA    NA    NA    NA    NA   
## 3 abnor… ABNORM NA    NA    agene… Q0000… Q       anom… apla… atre… birt… cong… defe… defo… hypo…

答案 1 :(得分:1)

这也许是解决方案的起点:

library(tidyverse)

imported_data %>% str_split("\\*NEWRECORD") -> l
l[[1]][-1] %>%
  purrr::map(
    function(x) data.frame(z=str_split(x,"\n")[[1]][-1]) %>%
                filter(str_detect(z,"="))
  ) %>%
  purrr::map(
    function(x) separate(x,z,c("k","v")," = ",extra="merge") %>%
                mutate(k=str_replace_all(k," ",""))
  )

#[[1]]
#   k                  v
#1 SH diagnostic imaging
#2 QE         DIAG IMAGE
#3 QA                 DG
#4 QT                  1
#5 QX          X-ray|NRW
#6 UI         Q000000981

#[[2]]
#        k                     v
#1 RECTYPE                     Q
#2      SH analogs & derivatives
#3      QE               ANALOGS
#4      QA                    AA
#5      QT                     1

#[[3]]
#         k                      v
#1  RECTYPE                      Q
#2       SH          abnormalities
#3       QE                 ABNORM
#4       QX           agenesis|NRW
#5       QX          anomalies|EQV
#6       QX            aplasia|NRW
#7       QX            atresia|NRW
#8       QX      birth defects|NRW
#9       QX congenital defects|NRW
#10      QX            defects|NRW
#11      QX        deformities|NRW
#12      QX         hypoplasia|NRW
#13      UI                Q000002

要从所有这些数据中仅获取一个数据帧,可以选择以下选项:

imported_data %>% 
  str_split("\\*NEWRECORD") -> l
l[[1]][-1] %>%
   purrr::map(function(x) data.frame(z=str_split(x,"\n")[[1]][-1]) %>%
                          filter(str_detect(z,"="))) %>%
   purrr::map(function(x) separate(x,z,c("k","v")," = ",extra="merge") %>%
                          mutate(k=str_replace_all(k," ","")) %>%
                          group_by(k) %>%
                          summarise(v= paste(v,collapse=", ")) %>%
                          spread(k,v)
   ) %>% purrr::reduce(bind_rows)
## A tibble: 3 x 7
#  QA    QE         QT    QX                                                                                                                                        SH                   UI        RECTYPE
#  <chr> <chr>      <chr> <chr>                                                                                                                                 <chr>                <chr>     <chr>  
#1 DG    DIAG IMAGE 1     X-ray|NRW                                                                                                                             diagnostic imaging   Q0000009~ <NA>   
#2 AA    ANALOGS    1     <NA>                                                                                                                                  analogs & derivativ~ <NA>      Q      
#3 <NA>  ABNORM     <NA>  agenesis|NRW, anomalies|EQV, aplasia|NRW, atresia|NRW, birth defects|NRW, congenital defects|NRW, defects|NRW, deformities|NRW, hypo~ abnormalities        Q000002   Q

答案 2 :(得分:0)

保留上面的答案即可为其他人“展示作品”,但是您现在可以这样做:

devtools::install_github("hrbrmstr/whatamesh")

,然后执行此操作以列出站点上的文件:

list_mesh_files()
## # A tibble: 3 x 3
##   `Size (Bytes)` `Last Modified` File     
##   <chr>          <chr>           <chr>    
## 1 98499407       Oct 15 04:38    c2018.bin
## 2 29052512       Jul 16 04:31    d2018.bin
## 3 85614          Jul 16 04:30    q2018.bin

如果文件是本地文件,它将在本地读取,否则将下载并读取:

read_mesh_file("q2018.bin")
## # A tibble: 79 x 16
##    RECTYPE SH                       QE    QA    QT    MS    AN    HN    QX    DA    MR    DQ    UI    OL    TN    QS   
##    <chr>   <chr>                    <chr> <chr> <chr> <chr> <chr> <chr> <lis> <chr> <chr> <chr> <chr> <chr> <lis> <chr>
##  1 Q       diagnostic imaging       DIAG… DG    1     Used… subh… 2017… <chr… 2016… 2016… 2017… Q000… <NA>  <NUL… <NA> 
##  2 Q       abnormalities            ABNO… AB    1     Used… subh… 66; … <chr… 1973… 2015… 1966… Q000… sear… <chr… <NA> 
##  3 Q       administration & dosage  ADMIN AD    1     Used… "sub… 66; … <chr… 1973… 2017… 1966… Q000… sear… <chr… ADMI…
##  4 Q       adverse effects          ADV … AE    1     Used… subh… 66; … <chr… 1973… 2017… 1966… Q000… sear… <chr… <NA> 
##  5 Q       analogs & derivatives    ANAL… AA    1     Used… subh… 75; … <chr… 1974… 2017… 1975… Q000… sear… <chr… <NA> 
##  6 Q       analysis                 ANAL  AN    1     "Use… "sub… 67; … <chr… 1973… 2003… 1967… Q000… sear… <chr… <NA> 
##  7 Q       anatomy & histology      ANAT  AH    1     Used… subh… 66; … <chr… 1973… 2017… 1966… Q000… sear… <chr… ANAT…
##  8 Q       antagonists & inhibitors ANTAG AI    1     Used… subh… 68; … <chr… 1973… 2017… 1968… Q000… sear… <chr… <NA> 
##  9 Q       biosynthesis             BIOS… BI    1     Used… "sub… 66; … <chr… 1973… 2003… 1966… Q000… sear… <chr… <NA> 
## 10 Q       blood                    BLOOD BL    1     "Use… "sub… 67; … <NUL… 1973… 2003… 1967… Q000… sear… <chr… <NA> 
## # ... with 69 more rows

它也支持“宽”格式:

read_mesh_file("q2018.bin", wide = TRUE)
## # A tibble: 79 x 108
##    RECTYPE SH     QE    QA    QT    MS    AN    HN    QX    QX.1  QX.2  QX.3  QX.4  QX.5  QX.6  QX.7  QX.8  QX.9  QX.10
##    <chr>   <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 Q       diagn… DIAG… DG    1     Used… subh… 2017… X-ra… X-ra… X-ra… echo… echo… radi… radi… radi… roen… ultr… ultr…
##  2 Q       abnor… ABNO… AB    1     Used… subh… 66; … agen… anom… apla… atre… birt… cong… defe… defo… hypo… malf… <NA> 
##  3 Q       admin… ADMIN AD    1     Used… "sub… 66; … admi… <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  4 Q       adver… ADV … AE    1     Used… subh… 66; … side… <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  5 Q       analo… ANAL… AA    1     Used… subh… 75; … anal… deri… <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  6 Q       analy… ANAL  AN    1     "Use… "sub… 67; … assa… chem… dete… <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  7 Q       anato… ANAT  AH    1     Used… subh… 66; … anat… anat… hist… morp… <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  8 Q       antag… ANTAG AI    1     Used… subh… 68; … anta… anta… inhi… <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  9 Q       biosy… BIOS… BI    1     Used… "sub… 66; … anab… biof… <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## 10 Q       blood  BLOOD BL    1     "Use… "sub… 67; … <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## # ... with 69 more rows, and 89 more variables: QX.11 <chr>, DA <chr>, MR <chr>, DQ <chr>, UI <chr>, OL <chr>,
## #   TN <chr>, TN.1 <chr>, TN.2 <chr>, TN.3 <chr>, TN.4 <chr>, TN.5 <chr>, TN.6 <chr>, TN.7 <chr>, TN.8 <chr>,
## #   TN.9 <chr>, TN.10 <chr>, TN.11 <chr>, TN.12 <chr>, QS <chr>, TN.13 <chr>, TN.14 <chr>, TN.15 <chr>, TN.16 <chr>,
## #   TN.17 <chr>, TN.18 <chr>, TN.19 <chr>, TN.20 <chr>, TN.21 <chr>, TN.22 <chr>, TN.23 <chr>, TN.24 <chr>,
## #   TN.25 <chr>, TN.26 <chr>, TN.27 <chr>, TN.28 <chr>, TN.29 <chr>, TN.30 <chr>, TN.31 <chr>, TN.32 <chr>,
## #   TN.33 <chr>, TN.34 <chr>, TN.35 <chr>, TN.36 <chr>, TN.37 <chr>, TN.38 <chr>, TN.39 <chr>, TN.40 <chr>,
## #   TN.41 <chr>, TN.42 <chr>, TN.43 <chr>, TN.44 <chr>, TN.45 <chr>, TN.46 <chr>, TN.47 <chr>, TN.48 <chr>,
## #   TN.49 <chr>, TN.50 <chr>, TN.51 <chr>, TN.52 <chr>, TN.53 <chr>, TN.54 <chr>, TN.55 <chr>, TN.56 <chr>,
## #   TN.57 <chr>, TN.58 <chr>, TN.59 <chr>, TN.60 <chr>, TN.61 <chr>, TN.62 <chr>, TN.63 <chr>, TN.64 <chr>,
## #   TN.65 <chr>, TN.66 <chr>, TN.67 <chr>, TN.68 <chr>, TN.69 <chr>, TN.70 <chr>, TN.71 <chr>, TN.72 <chr>,
## #   TN.73 <chr>, TN.74 <chr>, TN.75 <chr>, TN.76 <chr>, TN.77 <chr>, TN.78 <chr>, QX.12 <chr>, QX.13 <chr>, QX.14 <chr>

而且,它也可以与其他两个大文件一起使用(d / l需要一段时间,而解析它们需要一段时间)。

它应该处理MeSH格式的所有已定义重复列。

如果有任何问题请file issues,然后在此处发布。

答案 3 :(得分:-1)

这是一种方法:

library(dplyr)
split1 <- strsplit(imported_data, "\\*NEWRECORD") %>% unlist()
split2 <- split1
if(sum (0==nchar(split1))>0)  split2 <- split1[-which(0==nchar(split1))]

split3 <- strsplit(split2, "\\n")

split4 <- lapply(split3, function(x) strsplit(x, " = "))
split5 <- split4[lapply(split4,length)>0]  #lapply(split4, function(x) if(sum(length(x)==0)>0) {x[-which(length(x)==0)]} else x )

lapply(split4, nchar)
length(split4[[3]][[2]])

is.null( split4[[3]][[1]])

split5 <- lapply(split4, function(x) ifelse(lapply(x, function(y) length(y) >0), x, ""))
split6 <- lapply(split5, function(x) lapply(x, function(y) gsub(" ", "", y)))
split7 <- lapply(split6, function(x) x[nzchar(x)])

lapply(split6, length)

dim1 <- length(split7)
dim2 <- lapply(split7, length) %>% unlist()
dim3 <- lapply(split7, function(x) lapply(x, length) ) %>% unlist()

listres <-  list()

for( i in 1:length(dim2)) {
  mat <- matrix(NA, dim2[i], 2)
  mat <- data.frame(matrix(unlist(split7[[i]]), nrow= dim2[i], byrow=T))# %>% t()
  dim(mat)
  names(mat) <- c("varname", "value")
  mat <- t(mat)
  colnames(mat) <- mat[1,]
  mat <- mat[-1,]
  listres[[i]] <- mat
}
listres

[[1]]
                 SH                  QE                  QA                  QT 
"diagnosticimaging"         "DIAGIMAGE"                "DG"                 "1" 
                 QX                  UI 
        "X-ray|NRW"        "Q000000981" 

[[2]]
              RECTYPE                    SH                    QE 
                  "Q" "analogs&derivatives"             "ANALOGS" 
                   QA                    QT 
                 "AA"                   "1" 

[[3]]
                RECTYPE                      SH                      QE 
                    "Q"         "abnormalities"                "ABNORM" 
                     QX                      QX                      QX 
         "agenesis|NRW"         "anomalies|EQV"           "aplasia|NRW" 
                     QX                      QX                      QX 
          "atresia|NRW"      "birthdefects|NRW" "congenitaldefects|NRW" 
                     QX                      QX                      QX 
          "defects|NRW"       "deformities|NRW"        "hypoplasia|NRW" 
                     UI 
              "Q000002" 

您现在有了一个包含指定n个数据帧的列表。

编辑:您的帖子中的表1使用data.frame(listres[[1]]) %>% t()

找到
                 SH                  QE                  QA                  QT 
"diagnosticimaging"         "DIAGIMAGE"                "DG"                 "1" 
                 QX                  UI 
        "X-ray|NRW"        "Q000000981" 

其他数据帧依此类推。