R xml树到数据帧

时间:2016-03-27 12:41:48

标签: xml r xpath xml-parsing

我有以下XML tree

library("XML")
library("RCurl")
url <- "https://doc-0s-9c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/rk8a2gr7rl8e8s8j0luiak0cahtcjnak/1459080000000/07495711428163271540/*/0BzmnaOABaMIgTEl6SnRUdU9Eb2M?e=download"
bin <- getURL(url)
con <- file("reference.xml", open = "wb")
writeBin(bin, con)
close(con)
OperationList <- xmlTreeParse("reference.xml", useInternal = TRUE)

我可以为计划名称获取一个数据框,为操作名称获取一个数据框。

planname <- data.frame(sapply(OperationList["//subgroups/OperationGroup/subgroups/OperationGroup/operations/OperationHeader/plans/PlanHeader/name"], xmlValue))
operationanme <- data.frame(sapply(OperationList["//subgroups/OperationGroup/subgroups/OperationGroup/operations/OperationHeader/name"], xmlValue))

但是将它们放在一个df中(即展平xml树)不起作用。

我经历了多种方法(比如下面我所尝试的和我得到的错误消息)但到目前为止没有任何工作。谢谢你指出我犯的错误。

xmlToDataFrame函数

Operation.df1 <-  xmlToDataFrame(OperationList)

列的重复下标

xmlToDF函数

根据https://hopstat.wordpress.com/2014/01/14/faster-xml-conversion-to-data-frames/

require(XML)
xmlToDF = function(doc, xpath, isXML = TRUE, usewhich = TRUE, verbose = TRUE) {

  if (!isXML) 
    doc = xmlParse(doc)
  #### get the records for that form
  nodeset <- getNodeSet(doc, xpath)

  ## get the field names
  var.names <- lapply(nodeset, names)

  ## get the total fields that are in any record
  fields = unique(unlist(var.names))

  ## extract the values from all fields
  dl = lapply(fields, function(x) {
    if (verbose) 
    print(paste0("  ", x))
    xpathSApply(proc, paste0(xpath, "/", x), xmlValue)
  })

  ## make logical matrix whether each record had that field
  name.mat = t(sapply(var.names, function(x) fields %in% x))
  df = data.frame(matrix(NA, nrow = nrow(name.mat), ncol = ncol(name.mat)))
  names(df) = fields

  ## fill in that data.frame
  for (icol in 1:ncol(name.mat)) {
    rep.rows = name.mat[, icol]
    if (usewhich) 
    rep.rows = which(rep.rows)
    df[rep.rows, icol] = dl[[icol]]
  }

  return(df)
}

Operation.df2 <- xmlToDF(OperationList,
  xpath = "/subgroups/OperationGroup/subgroups/OperationGroup/name")

name.mat [,icol]中的错误:下标越界

rbind&amp; xpathApply

require(XML)

Operation.df3 <- xpathApply(OperationList,
  "/subgroups/OperationGroup/subgroups/OperationGroup/name",
  function(node) {
  region <- xmlValue(node[["name"]])
  xp <- "./operations/OperationHeader/name"
  operation <- xpathSApply(node, xp, xmlValue)
  if (is.null(operation)) operation <- NA
  data.frame(region, operation, stringsAsFactors = FALSE)
})
do.call(rbind, Operation.df3  )

给出一个NULL

xmlToList和plyr

需要(XML)   需要(plyr)   OperationList2&lt; - xmlToList(OperationList)   Operation.df4&lt; - ldply(OperationList2,data.frame)

*给我的参数意味着不同的行数:1,0

xmlToList,plyr和data.table

require(data.table)
Operation.df41 <- data.frame(rbindlist(OperationList2))

列表输入的第1项不是data.frame,data.table或list

Operation.df42 <-  rbindlist(OperationList2)

列表输入的第1项不是data.frame,data.table或list

Operation.df43 <- data.frame(matrix(unlist(OperationList2),
      byrow=T),stringsAsFactors=FALSE) 

只有一列

Operation.df44 <- lapply(OperationList2, data.frame,
  stringsAsFactors = FALSE) 

参数意味着行数不同:1,0

Operation.df45 <- rbind.fill(Operation.df44)

在循环中使用函数

Convert (possibly malformed) xml into Data Frame in R

xp <- function (OperationList, tag){
  n <- xpathSApply(OperationList, tag, xmlValue)
  if (length(n) > 0) 
    # paste multiple values?  
    paste0(n, collapse="; ") 
  else NA
}

z <- getNodeSet(OperationList, "//subgroups/OperationGroup/subgroups/OperationGroup")
n <-length(z)
notices <-vector("list",n)
for(i in 1:n)
{
  Operation.df5<-xmlDoc(z[[i]])
  Operation.df5[[i]] <- data.frame(
    region = xp(z2, "//name"),
    operation = xp(z2, "//operations/OperationHeader/name"),
    stringsAsFactors=FALSE)
  free(Operation.df5)  
}
do.call("rbind", Operation.df5)

类型'externalptr'的对象不是可子集化的

设置getNodeSet

for (i in 1:length(getNodeSet(OperationList, "//subgroups/OperationGroup"))) 
{
  if (i==1) {
    foo<-xmlSApply(OperationList[[i]], xmlValue)
    Operation.df6 <-data.frame(t(foo), stringsAsFactors=FALSE)
  }
  else {
    foo<-xmlSApply(OperationList[[i]], xmlValue)
    tmp<-data.frame(t(foo), stringsAsFactors=FALSE)
    Operation.df6 <-rbind(Operation.df6, tmp)
  }
}

没有使用整数

对XMLInternalDocument进行子集化的方法

请帮忙!缺少什么?

1 个答案:

答案 0 :(得分:3)

为每个OperationHeader获取其名称(opName)和所有计划的名称(计划),为每个OperationHeader创建一个组件的列表。最后rbind组件在一起:

L <- xpathSApply(OperationList, "//OperationHeader", function(x) 
           cbind(opName = xmlValue(x[["name"]]),
                 plan = xpathSApply(x, "plans/PlanHeader/name", xmlValue) 
           )
     )

do.call("rbind", L)

,并提供:

     opName    plan         
[1,] "State A" "Target Plan"
[2,] "State A" "Revision"   
[3,] "State B" "Target Plan"
[4,] "Avgh"    "Target Plan"
[5,] "Alaska"  "Target Plan"
[6,] "Alaska"  "Revision"