我有一个像这样的XML文件
<deviations-hour date-hour="2019052403" i-point="1.12291" price-up="1.12291" price-down="1.06395">
<deviations deviation-type="46" deviation-price="1.12290" deviation-power="138.000">
</deviations>
</deviations-hour>
<deviations-hour date-hour="2019052404" i-point="1.16397" price-up="1.16397" price-down="1.10773">
<deviations deviation-type="45" deviation-price="1.10774" deviation-power="-685.000">
</deviations>
</deviations-hour>
<deviations-hour date-hour="2019052405" i-point="1.32412" price-up="1.32412" price-down="1.12999">
</deviations-hour>
<deviations-hour date-hour="2019052406" i-point="1.18508" price-up="1.18508" price-down="1.14266">
<deviations deviation-type="45" deviation-price="1.14266" deviation-power="-214.000">
</deviations>
</deviations-hour>
我试图将文件读入数据框。
library(XML)
# Parse the XML input; `xmlfile` is defined outside this snippet.
doc <- xmlParse(xmlfile)
# Select every <deviations-hour> node and every <deviations> node in the document.
hour_nodes <- getNodeSet(doc, path = "//deviations-hour")
deviation_nodes <- getNodeSet(doc, path = "//deviations")
# Turn the attributes of each node set into a data frame (one row per node).
# Only nodes that actually exist are returned, so partB has no row for an
# hour that lacks a <deviations> child.
partA <- XML:::xmlAttrsToDataFrame(hour_nodes)
partB <- XML:::xmlAttrsToDataFrame(deviation_nodes)
xmlAttrsToDataFrame 工作正常。但是,XML 文件中有些 deviations-hour 节点下缺少 deviations 标签,而数据框 partB 中并没有对应的 NA 行,该函数只是直接跳过了这些节点。
结果,partB 的行数比 partA 少,因此我无法正确合并它们。
答案 0 :(得分:0)
下面是使用 xml2 包和 tidyverse 的一个初步方案。
我加入了很多注释,以阐明所采取的每个步骤。
library( xml2 )
library( tidyverse )

# Parse the XML document.
doc <- xml2::read_xml( "./test.xml" )

# Collect all <deviations-hour> nodes and all <deviations> nodes.
devHr.nodes <- xml2::xml_find_all( doc, ".//deviations-hour" )
dev.nodes <- xml2::xml_find_all( doc, ".//deviations" )

# Unique attribute names that occur on each of the two node sets.
att.devHr <- unique( unlist( lapply( xml2::xml_attrs( devHr.nodes ), names ) ) )
att.dev <- unique( unlist( lapply( xml2::xml_attrs( dev.nodes ), names ) ) )

# Part A: one row per <deviations-hour> node, one column per attribute.
# as.data.frame() makes the names syntactic, so "date-hour" becomes "date.hour".
partA <- att.devHr %>%
  lapply( function( attr_name ) xml_attr( devHr.nodes, attr_name ) ) %>%
  set_names( att.devHr ) %>%
  as.data.frame()

# Part B: one row per <deviations> node, one column per attribute, plus the
# "date-hour" attribute of the enclosing <deviations-hour> node as join key.
partB <- att.dev %>%
  lapply( function( attr_name ) xml_attr( dev.nodes, attr_name ) ) %>%
  set_names( att.dev ) %>%
  as.data.frame() %>%
  mutate(
    date.hour = xml_attr(
      xml_find_first( dev.nodes, ".//parent::deviations-hour" ),
      "date-hour"
    )
  )

# Left join on the timestamp: hours without a <deviations> child keep their
# row and get NA in the deviation columns.
partA %>% left_join( partB, by = "date.hour" )
输出
# date.hour i.point price.up price.down deviation.type deviation.price deviation.power
# 1 2019052403 1.12291 1.12291 1.06395 46 1.12290 138.000
# 2 2019052404 1.16397 1.16397 1.10773 45 1.10774 -685.000
# 3 2019052405 1.32412 1.32412 1.12999 <NA> <NA> <NA>
# 4 2019052406 1.18508 1.18508 1.14266 45 1.14266 -214.000
样本数据“ ./test.xml”
<?xml version="1.0" encoding="UTF-8"?>
<data>
<deviations-hour date-hour="2019052403" i-point="1.12291" price-up="1.12291" price-down="1.06395">
<deviations deviation-type="46" deviation-price="1.12290" deviation-power="138.000">
</deviations>
</deviations-hour>
<deviations-hour date-hour="2019052404" i-point="1.16397" price-up="1.16397" price-down="1.10773">
<deviations deviation-type="45" deviation-price="1.10774" deviation-power="-685.000">
</deviations>
</deviations-hour>
<deviations-hour date-hour="2019052405" i-point="1.32412" price-up="1.32412" price-down="1.12999">
</deviations-hour>
<deviations-hour date-hour="2019052406" i-point="1.18508" price-up="1.18508" price-down="1.14266">
<deviations deviation-type="45" deviation-price="1.14266" deviation-power="-214.000">
</deviations>
</deviations-hour>
</data>