在R中缺少标签的情况下解析以属性为中心的XML

时间:2019-06-20 04:32:54

标签: r xml

我有一个像这样的XML文件

<deviations-hour date-hour="2019052403" i-point="1.12291" price-up="1.12291" price-down="1.06395">
    <deviations deviation-type="46" deviation-price="1.12290" deviation-power="138.000">
    </deviations>
</deviations-hour>
<deviations-hour date-hour="2019052404" i-point="1.16397" price-up="1.16397" price-down="1.10773">
    <deviations deviation-type="45" deviation-price="1.10774" deviation-power="-685.000">
    </deviations>
</deviations-hour>
    <deviations-hour date-hour="2019052405" i-point="1.32412" price-up="1.32412" price-down="1.12999">
</deviations-hour>
<deviations-hour date-hour="2019052406" i-point="1.18508" price-up="1.18508" price-down="1.14266">
    <deviations deviation-type="45" deviation-price="1.14266" deviation-power="-214.000">
    </deviations>
</deviations-hour>

我试图将文件读入数据框。

library("XML")
# Parse the XML input referenced by `xmlfile`.
doc <- xmlParse(xmlfile)

# Select the nodes first, then turn their attributes into data frames.
# `//deviations` matches only <deviations> elements that actually exist, so a
# <deviations-hour> without such a child contributes no row to partB.
hourNodes <- getNodeSet(doc, "//deviations-hour")
devNodes  <- getNodeSet(doc, "//deviations")

# xmlAttrsToDataFrame is not exported by XML, hence the `:::` access.
partA <- XML:::xmlAttrsToDataFrame(hourNodes)
partB <- XML:::xmlAttrsToDataFrame(devNodes)

xmlAttrsToDataFrame本身工作正常。但是,XML文件中有些deviations-hour节点缺少deviations子标签,而数据框partB并不会为这些节点填充NA,该函数只是直接跳过了它们。结果,partB的行数与partA不一致,因此我无法正确地合并它们。

1 个答案:

答案 0 :(得分:0)

这是使用xml2包和tidyverse的一个初步方案

我加入了很多注释,以阐明所采取的每个步骤。

library( xml2 )
library( tidyverse )

# Read the XML document.
doc <- xml2::read_xml( "./test.xml" )

# Select every <deviations-hour> node and every <deviations> node.
devHr.nodes <- xml2::xml_find_all( doc, ".//deviations-hour" )
dev.nodes   <- xml2::xml_find_all( doc, ".//deviations" )

# Collect the unique attribute names that occur on each kind of node.
att.devHr <- xml2::xml_attrs( devHr.nodes ) %>% lapply( names ) %>% unlist() %>% unique()
att.dev   <- xml2::xml_attrs( dev.nodes )   %>% lapply( names ) %>% unlist() %>% unique()

# Build part A: one row per <deviations-hour> node.
partA <-
  # for each attribute name, extract its value from all devHr.nodes
  lapply( att.devHr, function(x) xml2::xml_attr( devHr.nodes, x ) ) %>%
  # name each value vector after its attribute
  set_names( att.devHr ) %>%
  # as.data.frame() makes the names syntactic ("date-hour" becomes "date.hour");
  # stringsAsFactors = FALSE keeps the values character on R < 4.0 as well, so
  # the join key has the same type in partA and partB
  as.data.frame( stringsAsFactors = FALSE )

# Build part B: one row per <deviations> node.
partB <-
  # for each attribute name, extract its value from all dev.nodes
  lapply( att.dev, function(x) xml2::xml_attr( dev.nodes, x ) ) %>%
  # name each value vector after its attribute
  set_names( att.dev ) %>%
  # convert to data.frame (see part A for the naming / type notes)
  as.data.frame( stringsAsFactors = FALSE ) %>%
  # step straight to each node's <deviations-hour> parent and take its
  # date-hour attribute as the join key; xml_find_first() returns one result
  # per input node, so the column lines up with the rows of partB
  mutate( date.hour = xml2::xml_find_first( dev.nodes, "parent::deviations-hour" ) %>%
            xml2::xml_attr( "date-hour" ) )

# Left join on the timestamp: hours without a <deviations> child keep their
# row, with NA in the deviation columns.
left_join( partA, partB, by = "date.hour" )

输出

#    date.hour i.point price.up price.down deviation.type deviation.price deviation.power
# 1 2019052403 1.12291  1.12291    1.06395             46         1.12290         138.000
# 2 2019052404 1.16397  1.16397    1.10773             45         1.10774        -685.000
# 3 2019052405 1.32412  1.32412    1.12999           <NA>            <NA>            <NA>
# 4 2019052406 1.18508  1.18508    1.14266             45         1.14266        -214.000

样本数据“ ./test.xml”

<?xml version="1.0" encoding="UTF-8"?>
<data>
    <deviations-hour date-hour="2019052403" i-point="1.12291" price-up="1.12291" price-down="1.06395">
        <deviations deviation-type="46" deviation-price="1.12290" deviation-power="138.000">
        </deviations>
    </deviations-hour>
    <deviations-hour date-hour="2019052404" i-point="1.16397" price-up="1.16397" price-down="1.10773">
        <deviations deviation-type="45" deviation-price="1.10774" deviation-power="-685.000">
        </deviations>
    </deviations-hour>
        <deviations-hour date-hour="2019052405" i-point="1.32412" price-up="1.32412" price-down="1.12999">
    </deviations-hour>
    <deviations-hour date-hour="2019052406" i-point="1.18508" price-up="1.18508" price-down="1.14266">
        <deviations deviation-type="45" deviation-price="1.14266" deviation-power="-214.000">
        </deviations>
    </deviations-hour>
</data>