R将xml复制到数据框

时间:2018-05-23 13:25:19

标签: r xml xml-parsing

我正在寻找一种方法,把一个高度复杂的XML文件(太长,所以放在本文底部)转换为一张表。该文件取自官方不动产登记处(Property Registry),存储了大约20,000个建筑物的数据。

每个“consulta_dnp”(每个建筑物)的结果必须是一行,这些数据在列中:

<pc1><pc2><car><cc1><cc2><np><nm><luso><sfc><cpt><ant>

另一个问题是:当某条数据无法检索时,返回的是一条错误记录。它以这种方式存储:

<consulta_dnp>
  <control>
    <cuerr>1</cuerr>
  </control>
  <lerr>
    <err>
      <cod>4</cod>
      <des>error description</des>
    </err>
  </lerr>
</consulta_dnp>

我对错误代码不感兴趣,我只想要一个空行,“错误”或其他内容。

我一直在参考类似(similar)问题的答案,但没有成功。

这是我使用的代码

# Parse the registry export and try to build one data-frame row per
# <consulta_dnp> record by extracting each field separately and merging.
# NOTE(review): this is the asker's non-working attempt; the notes below mark
# the visible problems without changing the code.
doc <- xmlParse("resultado_JA-.txt")

# Number of <consulta_dnp> records in the document.
xml_len <- length(getNodeSet(doc,"//consulta_dnp"))

dflist <- lapply(seq(xml_len), function(i){   
  # PARENT NODES
  # NOTE(review): the XPath built here ends in "/" ("//consulta_dnp[i]/"),
  # which is the expression reported as invalid in the error output shown
  # after this script.
  d1 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/"))), key=1)
  # CHILD NODES
  # NOTE(review): these paths use "ibdi", but the sample XML further down
  # names the element <idbi>, and the sample nests <rc> under <idbi> -- verify.
  d2 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc1"))), key=1) 
  d3 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc2"))), key=1) 
  # NOTE(review): d4 queries the same path as d2 (pc1 again).
  d4 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc1"))), key=1) 
  d5 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/car"))), key=1) 
  d6 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/cc1"))), key=1) 
  d7 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/cc2"))), key=1) 
  d8 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/dt/np"))), key=1) 
  d9 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/dt/nm"))), key=1) 
  d10 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ldt"))), key=1) 
  d11 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/luso"))), key=1) 
  d12 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/sfc"))), key=1) 
  d13 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/cpt"))), key=1) 
  d14 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/ant"))), key=1) 

  # MERGE ON KEY, THEN DROP KEY
  # NOTE(review): base merge() is documented as joining two data frames
  # (x and y); passing fourteen positionally presumably does not merge them
  # all -- confirm before relying on this line.
  merge(d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, by="key")[-1]    
})

# Stack the per-record data frames into a single table.
xmldf_JA <- do.call(rbind, dflist)

这段代码能正确统计“consulta_dnp”的出现次数,但总是卡在下面这个错误上:

XPath error : Invalid expression
XPath error : Invalid expression
 Error in xpathApply.XMLInternalDocument(doc, path, fun, ..., namespaces = namespaces,  : 
  error evaluating xpath expression //consulta_dnp[1]/ 

任何帮助都会受到赞赏。

这是XML文件的内容(不是真实的数据,但结构是真实的)

<Doc>
 <consulta_dnp>
  <control>
    <cudnp>1</cudnp>
    <cucons>1</cucons>
    <cucul>0</cucul>
  </control>
  <bico>
    <bi>
      <idbi>
        <cn>UR</cn>
        <rc>
          <pc1>0499418</pc1>
          <pc2>VG3709N</pc2>
          <car>0008</car>
          <cc1>R</cc1>
          <cc2>E</cc2>
        </rc>
      </idbi>
      <dt>
        <loine>
          <cp>23</cp>
          <cm>50</cm>
        </loine>
        <cmc>900</cmc>
        <np>VILLACONEJOS DE ARRIBA</np>
        <nm>MALAGA</nm>
        <locs>
          <lous>
            <lourb>
              <dir>
                <cv>799</cv>
                <tv>CL</tv>
                <nv>calle</nv>
                <pnp>2</pnp>
                <snp>0</snp>
              </dir>
              <loint>
                <es>1</es>
                <pt>01</pt>
                <pu>B</pu>
              </loint>
              <dp>29005</dp>
              <dm>1</dm>
            </lourb>
          </lous>
        </locs>
      </dt>
      <ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
      <debi>
        <luso>Residencial</luso>
        <sfc>72</sfc>
        <cpt>3,430000</cpt>
        <ant>1979</ant>
      </debi>
    </bi>
    <lcons>
      <cons>
        <lcd>VIVIENDA</lcd>
        <dt>
          <lourb>
            <loint>
              <es>1</es>
              <pt>01</pt>
              <pu>B</pu>
            </loint>
          </lourb>
        </dt>
        <dfcons>
          <stl>72</stl>
        </dfcons>
      </cons>
    </lcons>
  </bico>
</consulta_dnp>
</Doc>

1 个答案:

答案 0 :(得分:0)

library(xml2)
library(tidyverse)

我会尝试使用以下方法:使用xml2读取数据,创建用于提取感兴趣元素的表达式,然后映射这些表达式并将其组合到data.frame。

# the structure of the document (code for data see below)
# I copied the code, so we have one entry, one error, and the first entry repeated
xml
#> {xml_document}
#> <Doc>
#> [1] <consulta_dnp>\n  <control>\n    <cudnp>1</cudnp>\n    <cucons>1</cu ...
#> [2] <consulta_dnp>\n  <control>\n    <cuerr>1</cuerr>\n  </control>\n  < ...
#> [3] <consulta_dnp>\n  <control>\n    <cudnp>1</cudnp>\n    <cucons>1</cu ...

# Helper: look up `xpath` relative to the xml2 object `x` and return the text
# of every matching node.
extract_child <- function(x, xpath) {
  matches <- xml_find_all(x, xpath)
  xml_text(matches)
}

# Tag names to extract; each one becomes a column of the result.
xpath_expressions <- c(
  "pc1", "pc2", "car", "cc1", "cc2",
  "np", "nm", "luso", "sfc", "cpt", "ant"
)

# Build one ".//<tag>" query per field (matches the tag at any depth below
# the document node), collect the matching texts per field, bind the fields
# into a table and let readr guess column types using "," as decimal mark.
map(
  set_names(paste0(".//", xpath_expressions), xpath_expressions),
  function(path) extract_child(xml, path)
) %>%
  dplyr::bind_rows() %>%
  type_convert(locale = locale(decimal_mark = ","))
#> # A tibble: 2 x 11
#>   pc1     pc2     car   cc1   cc2   np       nm    luso    sfc   cpt   ant
#>   <chr>   <chr>   <chr> <chr> <chr> <chr>    <chr> <chr> <int> <dbl> <int>
#> 1 0499418 VG3709N 0008  R     E     VILLACO… MALA… Resi…    72  3.43  1979
#> 2 0499418 VG3709N 0008  R     E     VILLACO… MALA… Resi…    72  3.43  1979

这种方法“神奇地”起作用,而且错误条目不会造成问题,因为只提取了我们感兴趣的那些部分,而有错误的条目与没有错误的条目之间没有重叠的标签。如果某些条目只缺少部分字段、而其他字段仍然存在,则需要调整代码。具体来说:当某个标签整体缺失时,这种方法会失效;如果所有标签都存在但内容为空(例如<ant></ant>),则会得到正确的NA。

更新

以下代码即使在缺少元素时也能正常运行,应该适用于您的数据。

# Look up `xpath` relative to the xml2 object `x` and return the text of every
# matching node. When nothing matches, return a single NA_character_ so the
# field is still represented by one (missing) value.
extract_child <- function(x, xpath) {
  values <- xml_text(xml_find_all(x, xpath))

  if (length(values) == 0) {
    return(NA_character_)
  }

  values
}

# Tags to extract from each record; one output column per tag.
xpath_expressions <- c("pc1", "pc2", "car", "cc1", "cc2", "np", "nm", "luso", "sfc",
                       "cpt", "ant")


#' Extract the fields of interest from a single record.
#'
#' Values are deliberately left as character here: guessing types one record
#' at a time can give the same field different types in different records
#' (e.g. a code with a leading zero stays character while another parses as a
#' number), which makes the final row-bind fail. Type conversion is done once
#' on the combined table instead.
#'
#' Fields are no longer filtered out when missing, so a record with none of
#' the tags (such as an error record) still yields a row, filled with NA.
#'
#' @param part An xml2 node, here one child of the document root.
#' @return A tibble with one character column per entry of
#'   `xpath_expressions`, holding whatever `extract_child()` returns for
#'   that tag.
extract_part <- function(part) {
  xpath_expressions %>%
    paste0(".//", .) %>% # match each tag at any depth below `part`
    map(~extract_child(part, .x)) %>%
    set_names(xpath_expressions) %>%
    dplyr::bind_rows()
}


# One row per child of the root; convert column types once, on the full
# table, using "," as the decimal mark.
xml %>%
  xml_children() %>%
  map_df(extract_part) %>%
  type_convert(locale = locale(decimal_mark = ","))

数据

   xml <- read_xml("<Doc>
     <consulta_dnp>
    <control>
    <cudnp>1</cudnp>
    <cucons>1</cucons>
    <cucul>0</cucul>
    </control>
    <bico>
    <bi>
    <idbi>
    <cn>UR</cn>
    <rc>
    <pc1>0499418</pc1>
    <pc2>VG3709N</pc2>
    <car>0008</car>
    <cc1>R</cc1>
    <cc2>E</cc2>
    </rc>
    </idbi>
    <dt>
    <loine>
    <cp>23</cp>
    <cm>50</cm>
    </loine>
    <cmc>900</cmc>
    <np>VILLACONEJOS DE ARRIBA</np>
    <nm>MALAGA</nm>
    <locs>
    <lous>
    <lourb>
    <dir>
    <cv>799</cv>
    <tv>CL</tv>
    <nv>calle</nv>
    <pnp>2</pnp>
    <snp>0</snp>
    </dir>
    <loint>
    <es>1</es>
    <pt>01</pt>
    <pu>B</pu>
    </loint>
    <dp>29005</dp>
    <dm>1</dm>
    </lourb>
    </lous>
    </locs>
    </dt>
    <ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
    <debi>
    <luso>Residencial</luso>
    <sfc>72</sfc>
    <cpt>3,430000</cpt>
    <ant>1979</ant>
    </debi>
    </bi>
    <lcons>
    <cons>
    <lcd>VIVIENDA</lcd>
    <dt>
    <lourb>
    <loint>
    <es>1</es>
    <pt>01</pt>
    <pu>B</pu>
    </loint>
    </lourb>
    </dt>
    <dfcons>
    <stl>72</stl>
    </dfcons>
    </cons>
    </lcons>
    </bico>
    </consulta_dnp>
    <consulta_dnp>
      <control>
                    <cuerr>1</cuerr>
                    </control>
                    <lerr>
                    <err>
                    <cod>4</cod>
                    <des>error description</des>
                    </err>
                    </lerr>
                    </consulta_dnp>
     <consulta_dnp>
    <control>
                    <cudnp>1</cudnp>
                    <cucons>1</cucons>
                    <cucul>0</cucul>
                    </control>
                    <bico>
                    <bi>
                    <idbi>
                    <cn>UR</cn>
                    <rc>
                    <pc1>0499418</pc1>
                    <pc2>VG3709N</pc2>
                    <car>0008</car>
                    <cc1>R</cc1>
                    <cc2>E</cc2>
                    </rc>
                    </idbi>
                    <dt>
                    <loine>
                    <cp>23</cp>
                    <cm>50</cm>
                    </loine>
                    <cmc>900</cmc>
                    <np>VILLACONEJOS DE ARRIBA</np>
                    <nm>MALAGA</nm>
                    <locs>
                    <lous>
                    <lourb>
                    <dir>
                    <cv>799</cv>
                    <tv>CL</tv>
                    <nv>calle</nv>
                    <pnp>2</pnp>
                    <snp>0</snp>
                    </dir>
                    <loint>
                    <es>1</es>
                    <pt>01</pt>
                    <pu>B</pu>
                    </loint>
                    <dp>29005</dp>
                    <dm>1</dm>
                    </lourb>
                    </lous>
                    </locs>
                    </dt>
                    <ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
                    <debi>
                    <luso>Residencial</luso>
                    <sfc>72</sfc>
                    <cpt>3,430000</cpt>
                    <ant>1979</ant>
                    </debi>
                    </bi>
                    <lcons>
                    <cons>
                    <lcd>VIVIENDA</lcd>
                    <dt>
                    <lourb>
                    <loint>
                    <es>1</es>
                    <pt>01</pt>
                    <pu>B</pu>
                    </loint>
                    </lourb>
                    </dt>
                    <dfcons>
                    <stl>72</stl>
                    </dfcons>
                    </cons>
                    </lcons>
                    </bico>
                    </consulta_dnp>
    </Doc>")