我正在寻找一种方法,把一个高度复杂的 XML 文件(内容太长,所以放在本文底部)转换为一张表。该文件取自官方不动产登记处(Property registry),存储了大约 20,000 栋建筑物的数据。
每个“consulta_dnp”(即每栋建筑物)在结果中必须占一行,并包含以下各列:
<pc1><pc2><car><cc1><cc2><np><nm><luso><sfc><cpt><ant>
另一个问题是:当某条记录的数据无法检索时,文件中会以如下方式存储一条错误信息:
<consulta_dnp>
<control>
<cuerr>1</cuerr>
</control>
<lerr>
<err>
<cod>4</cod>
<des>error description</des>
</err>
</lerr>
</consulta_dnp>
我对错误代码不感兴趣,对于这种记录,我只想得到一个空行、“错误”字样或类似的占位内容。
我一直在参考类似问题的答案,但没有成功。
这是我使用的代码
# Parse the XML file (xmlParse/getNodeSet/xmlToDataFrame are presumably from
# the XML package, as the error message mentions XMLInternalDocument).
doc <- xmlParse("resultado_JA-.txt")
# Number of <consulta_dnp> nodes found anywhere in the document.
xml_len <- length(getNodeSet(doc,"//consulta_dnp"))
# Build one data frame per <consulta_dnp>, addressed by its position i.
dflist <- lapply(seq(xml_len), function(i){
# PARENT NODES
# NOTE(review): this XPath ends with a trailing "/"; it is the expression
# named in the reported "Invalid expression" error.
d1 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/"))), key=1)
# CHILD NODES
# NOTE(review): the paths below use "ibdi", while the sample document uses
# <idbi> -- confirm the element name.
d2 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc1"))), key=1)
d3 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc2"))), key=1)
# NOTE(review): d4 queries the same pc1 path as d2.
d4 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/pc1"))), key=1)
d5 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/car"))), key=1)
d6 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/cc1"))), key=1)
d7 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ibdi/rc/cc2"))), key=1)
d8 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/dt/np"))), key=1)
d9 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/dt/nm"))), key=1)
d10 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/ldt"))), key=1)
d11 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/luso"))), key=1)
d12 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/sfc"))), key=1)
d13 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/cpt"))), key=1)
d14 <- transform(xmlToDataFrame(nodes=getNodeSet(doc, paste0("//consulta_dnp[",i,"]/bico/bi/debi/ant"))), key=1)
# MERGE ON KEY, THEN DROP KEY
# NOTE(review): fourteen data frames are passed to a single merge() call;
# verify against the merge() documentation that more than two are supported.
merge(d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, by="key")[-1]
})
# Stack the per-entry data frames into one table.
xmldf_JA <- do.call(rbind, dflist)
这段代码能正确统计“consulta_dnp”的出现次数,但总是卡在下面这个错误上:
XPath error : Invalid expression
XPath error : Invalid expression
Error in xpathApply.XMLInternalDocument(doc, path, fun, ..., namespaces = namespaces, :
error evaluating xpath expression //consulta_dnp[1]/
任何帮助都会受到赞赏。
这是 XML 文件的内容(不是真实数据,但结构是真实的)
<Doc>
<consulta_dnp>
<control>
<cudnp>1</cudnp>
<cucons>1</cucons>
<cucul>0</cucul>
</control>
<bico>
<bi>
<idbi>
<cn>UR</cn>
<rc>
<pc1>0499418</pc1>
<pc2>VG3709N</pc2>
<car>0008</car>
<cc1>R</cc1>
<cc2>E</cc2>
</rc>
</idbi>
<dt>
<loine>
<cp>23</cp>
<cm>50</cm>
</loine>
<cmc>900</cmc>
<np>VILLACONEJOS DE ARRIBA</np>
<nm>MALAGA</nm>
<locs>
<lous>
<lourb>
<dir>
<cv>799</cv>
<tv>CL</tv>
<nv>calle</nv>
<pnp>2</pnp>
<snp>0</snp>
</dir>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
<dp>29005</dp>
<dm>1</dm>
</lourb>
</lous>
</locs>
</dt>
<ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
<debi>
<luso>Residencial</luso>
<sfc>72</sfc>
<cpt>3,430000</cpt>
<ant>1979</ant>
</debi>
</bi>
<lcons>
<cons>
<lcd>VIVIENDA</lcd>
<dt>
<lourb>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
</lourb>
</dt>
<dfcons>
<stl>72</stl>
</dfcons>
</cons>
</lcons>
</bico>
</consulta_dnp>
</Doc>
答案 0 :(得分:0)
library(xml2)
library(tidyverse)
我会尝试以下方法:使用 xml2 读取数据,构造用于提取目标元素的 XPath 表达式,然后对这些表达式逐一映射(map),并将结果合并为一个 data.frame。
# Structure of the parsed document (the code that creates `xml` is shown
# further below).
# The sample entry was copied, so the document holds three entries: one valid
# entry, one error entry, and the first entry repeated.
xml
#> {xml_document}
#> <Doc>
#> [1] <consulta_dnp>\n <control>\n <cudnp>1</cudnp>\n <cucons>1</cu ...
#> [2] <consulta_dnp>\n <control>\n <cuerr>1</cuerr>\n </control>\n < ...
#> [3] <consulta_dnp>\n <control>\n <cudnp>1</cudnp>\n <cucons>1</cu ...
# Helper: return the text of every node below `x` that matches `xpath`.
# The result has one element per matching node, so it is empty when nothing
# matches.
extract_child <- function(x, xpath) {
  matched_nodes <- xml_find_all(x, xpath)
  xml_text(matched_nodes)
}
# Tags whose text should become the columns of the result.
xpath_expressions <- c(
  "pc1", "pc2", "car", "cc1", "cc2", "np", "nm", "luso", "sfc", "cpt", "ant"
)
# Prefix each tag with ".//" so it is searched at any depth below the root,
# collect the matches per tag, name them, and combine them into one table.
paste0(".//", xpath_expressions) %>%
  map(function(path) extract_child(xml, path)) %>%
  set_names(xpath_expressions) %>%
  dplyr::bind_rows() %>%
  # the data uses "," as decimal mark (e.g. "3,430000")
  type_convert(locale = locale(decimal_mark = ","))
#> # A tibble: 2 x 11
#> pc1 pc2 car cc1 cc2 np nm luso sfc cpt ant
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <dbl> <int>
#> 1 0499418 VG3709N 0008 R E VILLACO… MALA… Resi… 72 3.43 1979
#> 2 0499418 VG3709N 0008 R E VILLACO… MALA… Resi… 72 3.43 1979
这种方法之所以能“神奇地”奏效、并且不受错误记录影响,是因为它只提取我们感兴趣的那些标签,而正常记录与错误记录之间没有重叠的标签。如果某些条目只缺少部分字段、而其他字段存在,则需要调整代码。具体来说:当某个标签整体缺失时,这种方法会失效;如果所有标签都存在、只是内容为空(例如 <ant></ant>),则会正确地得到 NA。
以下代码即使在缺少某些元素时也能正常运行,应该适用于您的数据。
# Return the text of every node below `x` that matches `xpath`.
# When nothing matches, a single NA_character_ is returned instead of an
# empty vector, so the field still occupies one slot.
extract_child <- function(x, xpath) {
  values <- xml_text(xml_find_all(x, xpath))
  if (length(values) == 0L) {
    return(NA_character_)
  }
  values
}
# our fields of interest
xpath_expressions <- c(
  "pc1", "pc2", "car", "cc1", "cc2", "np", "nm", "luso", "sfc", "cpt", "ant"
)

# Extract the fields of interest from one child node of the document.
#
# part: a single node (here: one <consulta_dnp> entry).
# Returns a tibble with one character column per element of
# `xpath_expressions`. A field without a match is NA (extract_child() returns
# NA_character_ for it), so an error entry yields an all-NA row instead of
# silently disappearing from the result.
#
# Values are deliberately left as character: guessing types entry by entry
# can give the same column different types in different entries, which then
# cannot be row-bound. Types are converted once, on the combined table.
extract_part <- function(part) {
  xpath_expressions %>%
    paste0(".//", .) %>% # search at any depth below `part`
    map(~extract_child(part, .x)) %>%
    set_names(xpath_expressions) %>%
    tibble::as_tibble()
}

# One row per child of the root, then a single type conversion.
xml %>%
  xml_children() %>%
  map_df(extract_part) %>%
  # the data uses "," as decimal mark (e.g. "3,430000")
  type_convert(locale = locale(decimal_mark = ","))
# Example data: a <Doc> root with three <consulta_dnp> entries -- a complete
# record, an error record (<cuerr>/<lerr>), and the complete record repeated.
xml <- read_xml("<Doc>
<consulta_dnp>
<control>
<cudnp>1</cudnp>
<cucons>1</cucons>
<cucul>0</cucul>
</control>
<bico>
<bi>
<idbi>
<cn>UR</cn>
<rc>
<pc1>0499418</pc1>
<pc2>VG3709N</pc2>
<car>0008</car>
<cc1>R</cc1>
<cc2>E</cc2>
</rc>
</idbi>
<dt>
<loine>
<cp>23</cp>
<cm>50</cm>
</loine>
<cmc>900</cmc>
<np>VILLACONEJOS DE ARRIBA</np>
<nm>MALAGA</nm>
<locs>
<lous>
<lourb>
<dir>
<cv>799</cv>
<tv>CL</tv>
<nv>calle</nv>
<pnp>2</pnp>
<snp>0</snp>
</dir>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
<dp>29005</dp>
<dm>1</dm>
</lourb>
</lous>
</locs>
</dt>
<ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
<debi>
<luso>Residencial</luso>
<sfc>72</sfc>
<cpt>3,430000</cpt>
<ant>1979</ant>
</debi>
</bi>
<lcons>
<cons>
<lcd>VIVIENDA</lcd>
<dt>
<lourb>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
</lourb>
</dt>
<dfcons>
<stl>72</stl>
</dfcons>
</cons>
</lcons>
</bico>
</consulta_dnp>
<consulta_dnp>
<control>
<cuerr>1</cuerr>
</control>
<lerr>
<err>
<cod>4</cod>
<des>error description</des>
</err>
</lerr>
</consulta_dnp>
<consulta_dnp>
<control>
<cudnp>1</cudnp>
<cucons>1</cucons>
<cucul>0</cucul>
</control>
<bico>
<bi>
<idbi>
<cn>UR</cn>
<rc>
<pc1>0499418</pc1>
<pc2>VG3709N</pc2>
<car>0008</car>
<cc1>R</cc1>
<cc2>E</cc2>
</rc>
</idbi>
<dt>
<loine>
<cp>23</cp>
<cm>50</cm>
</loine>
<cmc>900</cmc>
<np>VILLACONEJOS DE ARRIBA</np>
<nm>MALAGA</nm>
<locs>
<lous>
<lourb>
<dir>
<cv>799</cv>
<tv>CL</tv>
<nv>calle</nv>
<pnp>2</pnp>
<snp>0</snp>
</dir>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
<dp>29005</dp>
<dm>1</dm>
</lourb>
</lous>
</locs>
</dt>
<ldt>CL calle 2 Es:1 Pl:01 Pt:B 29005 Madrid (Madrid)</ldt>
<debi>
<luso>Residencial</luso>
<sfc>72</sfc>
<cpt>3,430000</cpt>
<ant>1979</ant>
</debi>
</bi>
<lcons>
<cons>
<lcd>VIVIENDA</lcd>
<dt>
<lourb>
<loint>
<es>1</es>
<pt>01</pt>
<pu>B</pu>
</loint>
</lourb>
</dt>
<dfcons>
<stl>72</stl>
</dfcons>
</cons>
</lcons>
</bico>
</consulta_dnp>
</Doc>")