XPath:选择两个连续的元素并将它们转换为一个数据框

时间:2016-06-01 10:21:30

标签: r xml xpath elements

我对XPath和R都还是新手,正在尝试使用XPath将XML文件转换为R中的数据框。在别人的帮助下,我已经成功转换了XML中的大部分信息。但是,现在我想把两个连续的元素合并到同一个数据框中,却怎么也做不对。

这是xml数据的摘录:

</customer-bootstrap-data>
  <customer-bootstrap-data id="970911" customerName="HighIncome-1_4" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.1124173640233721,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.37606842556525066,-0.0,-0.0,-0.038684343289247636,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.8490012729862713,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0</netUsage>
  </customer-bootstrap-data>
  <customer-bootstrap-data id="970912" customerName="HighIncome-2_17" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.21395291779884928,-0.0,-0.0,-1.3581716633726693,-0.0,-0.0,-2.8140822306420716,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.0221045637055397,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-3.3,-3.223543705462774,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.153329511039221,-0.0,-0.0,-0.0,-0.0,-0.0,-0.820425411761537,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.7054631085029754,-0.0,-0.7130641168720118,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-2.5003661751788435,-0.0,-0.0,-3.3,-3.3,-0.6606989045692728,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.20818145620010853,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0493154269844851,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.0,-1.041919182358086,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.5334016276259916,-0.0,-0.0,-0.0</netUsage>
  </customer-bootstrap-data>

我可以使用以下代码生成包含第一个元素的表:

# Gather the attributes of every <customer-bootstrap-data> node, flip the
# result so each node becomes a row, and store it as a data frame.
customerBoot <- data.frame(
  t(xpathSApply(doc = xml, path = "//customer-bootstrap-data", fun = xmlAttrs))
)

该表包含id、customerName和powerType。但我还希望每个客户ID都带有对应的净使用量(netUsage)。

以下代码选择了我想要的所有信息,但它不允许我将其转换为数据框。

# Fetch every <customer-bootstrap-data> node; this yields the matching nodes
# themselves rather than a table.
customerBoot <- getNodeSet(doc = xml, path = "//customer-bootstrap-data")

有什么想法吗?我正在寻找一个快速的解决方案。

谢谢!

1 个答案:

答案 0 :(得分:1)

如果这种方式存在速度问题,请告诉我(以这种方式遍历一个巨大的XML文档有时会很慢):

library(XML)
library(purrr)

fil <- '<dat><customer-bootstrap-data id="970911" customerName="HighIncome-1_4" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.1124173640233721,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.37606842556525066,-0.0,-0.0,-0.038684343289247636,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.8490012729862713,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0</netUsage>
  </customer-bootstrap-data>
  <customer-bootstrap-data id="970912" customerName="HighIncome-2_17" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.21395291779884928,-0.0,-0.0,-1.3581716633726693,-0.0,-0.0,-2.8140822306420716,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.0221045637055397,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-3.3,-3.223543705462774,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.153329511039221,-0.0,-0.0,-0.0,-0.0,-0.0,-0.820425411761537,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.7054631085029754,-0.0,-0.7130641168720118,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-2.5003661751788435,-0.0,-0.0,-3.3,-3.3,-0.6606989045692728,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.20818145620010853,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0493154269844851,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.0,-1.041919182358086,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.5334016276259916,-0.0,-0.0,-0.0</netUsage>
  </customer-bootstrap-data></dat>'

# Parse the XML text held in `fil` into a document object.
doc <- xmlParse(fil)

# Pull the attribute set of each <customer-bootstrap-data> node, transpose so
# that every node is one row, and keep the strings as character columns.
customerBoot <- data.frame(
  t(xpathSApply(doc = doc, path = "//customer-bootstrap-data", fun = xmlAttrs)),
  stringsAsFactors = FALSE
)

# For each customer id, look up its <netUsage> node, split the comma-separated
# text into numbers, and append them to the attribute table as wide columns
# X1, X2, ... (one column per reading).
#
# purrr::by_row() has been moved out of purrr (into the purrrlyr package), so
# the row-wise step is done with base R here instead.

usage <- lapply(customerBoot$id, function(id) {
  path <- sprintf("//customer-bootstrap-data[@id='%s']/netUsage", id)
  raw <- xpathSApply(doc = doc, path = path, fun = xmlValue)
  as.numeric(strsplit(raw, ",")[[1]])
})

# The wide layout needs the same number of readings for every customer;
# reject ragged input up front instead of letting rbind() recycle values.
stopifnot(length(unique(lengths(usage))) == 1L)

usage <- do.call(rbind, usage)
colnames(usage) <- sprintf("X%d", seq_len(ncol(usage)))

# Keep the result a tibble, as the by_row() version returned one.
customerBoot <- dplyr::as_tibble(cbind(customerBoot, usage))

# More than 300 columns exist, so restrict the glimpse() summary (the
# counterpart of str()) to the first 15 of them:

dplyr::glimpse(customerBoot[seq_len(15)])

## Observations: 2
## Variables: 15
## $ id           (chr) "970911", "970912"
## $ customerName (chr) "HighIncome-1_4", "HighIncome-2_17"
## $ powerType    (chr) "ELECTRIC_VEHICLE", "ELECTRIC_VEHICLE"
## $ X1           (dbl) 0, 0
## $ X2           (dbl) 0, 0
## $ X3           (dbl) 0, 0
## $ X4           (dbl) 0, 0
## $ X5           (dbl) 0, 0
## $ X6           (dbl) 0, 0
## $ X7           (dbl) 0, 0
## $ X8           (dbl) 0.0000000, -0.2139529
## $ X9           (dbl) 0, 0
## $ X10          (dbl) 0, 0
## $ X11          (dbl) 0.000000, -1.358172
## $ X12          (dbl) 0, 0

xml2的替代方式:

library(purrr)
library(xml2)

# Parse the XML text held in `fil` with xml2.
doc <- read_xml(fil)

# Build one wide row per <customer-bootstrap-data> node: its attributes
# followed by the netUsage readings as numeric columns X1, X2, ...
xml_find_all(doc, "//customer-bootstrap-data") %>%
  xml_attrs() %>%
  map_df(function(x) {
    # Locate this customer's <netUsage> node via its id attribute.
    # xml_find_first() replaces the deprecated xml_find_one().
    path <- sprintf("//customer-bootstrap-data[@id='%s']/netUsage", x["id"])
    vals <- strsplit(xml_text(xml_find_first(doc, path)), ",")[[1]]
    # seq_along() stays empty for an empty vector, whereas 1:length(vals)
    # would count down from 1 to 0 and yield two bogus names.
    vals <- setNames(as.numeric(vals), sprintf("X%d", seq_along(vals)))
    rbind.data.frame(c(as.list(x), as.list(vals)), stringsAsFactors = FALSE)
  })

更新:

netUsage长度不统一时,这是一种方法:

# Variant for netUsage vectors of differing lengths: build one named list per
# customer and let rbindlist(fill = TRUE) pad the shorter ones with NA.
# NOTE(review): assumes customerBoot is the attribute-only table (id,
# customerName, powerType) -- confirm before running after the wide version.
data.table::rbindlist(apply(customerBoot, 1, function(x) {

  path <- sprintf("//customer-bootstrap-data[@id='%s']/netUsage", x["id"])
  raw <- xpathSApply(doc = doc, path = path, fun = xmlValue)
  vals <- strsplit(raw, ",")[[1]]
  # seq_along() yields no names for an empty netUsage, whereas 1:length(vals)
  # would count down from 1 to 0 and give two names for zero values.
  usage <- setNames(as.numeric(vals), sprintf("X%d", seq_along(vals)))
  c(as.list(x), as.list(usage))

}), fill = TRUE)