速度效率 - R for loop

时间:2014-12-17 03:23:31

标签: xml r xml-parsing xml-attribute

以下代码用于解析XML,并将节点、父级、类型等信息提取到数据框中。它适用于行数较少的XML文件,但是当处理超过25,000行的文件时,需要几分钟的时间。因此,我打算优化代码以便更快地处理。该函数的目的是读取任意XML文件,并按数据框所需的格式生成数据。

示例XML:

<?xml version="1.0" encoding="UTF-8"?>
<CATALOG>
   <PLANT id="1" required="false">
      <COMMON Source="NLM">Bloodroot</COMMON>
      <BOTANICAL>Aquilegia canadensis</BOTANICAL>
      <DATE>
         <Year>2013</Year>
      </DATE>
   </PLANT>
   <PLANT id="2" required="true">
      <COMMON Source="LNP">Columbine</COMMON>
      <BOTANICAL>Aquilegia canadensis</BOTANICAL>
      <DATE>
         <Year>2014</Year>
      </DATE>
   </PLANT>
</CATALOG>

输出:

                      path      node                value  parent      type
1                  CATALOG   CATALOG                 NULL    NULL   element
2            CATALOG/PLANT     PLANT                 NULL CATALOG   element
3            CATALOG/PLANT        id                    1   PLANT attribute
4            CATALOG/PLANT  required                false   PLANT attribute
5     CATALOG/PLANT/COMMON    COMMON            Bloodroot   PLANT      text
6     CATALOG/PLANT/COMMON    Source                  NLM  COMMON attribute
7  CATALOG/PLANT/BOTANICAL BOTANICAL Aquilegia canadensis   PLANT      text
8       CATALOG/PLANT/DATE      DATE                 NULL   PLANT   element
9  CATALOG/PLANT/DATE/Year      Year                 2013    DATE      text
10           CATALOG/PLANT     PLANT                 NULL CATALOG   element
11           CATALOG/PLANT        id                    2   PLANT attribute
12           CATALOG/PLANT  required                 true   PLANT attribute
13    CATALOG/PLANT/COMMON    COMMON            Columbine   PLANT      text
14    CATALOG/PLANT/COMMON    Source                  LNP  COMMON attribute
15 CATALOG/PLANT/BOTANICAL BOTANICAL Aquilegia canadensis   PLANT      text
16      CATALOG/PLANT/DATE      DATE                 NULL   PLANT   element
17 CATALOG/PLANT/DATE/Year      Year                 2014    DATE      text

代码段:

library(XML)
library(plyr)

## helper function of xPathApply
# Describe a single XML node as a flat, named list of record fields.
#
# Args:
#   x: an XML node, as handed to the callback by xpathSApply().
#
# Returns:
#   A flat list that holds the fields path, node, value, parent and type
#   once for the node itself (only when x inherits from
#   "XMLInternalElementNode") and once more for every attribute of x. The
#   list is empty when x is not an element node and carries no attributes.
getValues <- function(x) {
  # xmlAncestors() applies the callback along the ancestor chain of x; the
  # collected names are joined into a "/"-separated path
  ancestorNamesList <- xmlAncestors(x, fun = function(y) xmlName(y))
  pathName <- paste(ancestorNamesList, collapse = "/")

  # the literal string "NULL" stands in for a missing parent
  parentNode <- xmlParent(x)
  parentName <- if (is.null(parentNode)) "NULL" else xmlName(parentNode)

  records <- list()

  if (inherits(x, "XMLInternalElementNode")) {
    # read the node's own (non-recursive) text once and reuse it; a
    # zero-length result means the node has no text value
    ownText <- xmlValue(x, recursive = FALSE)
    hasText <- length(ownText) != 0
    records <- append(records, list(
      path = pathName,
      node = xmlName(x),
      value = if (hasText) ownText else "NULL",
      parent = parentName,
      type = if (hasText) "text" else "element"
    ))
  }

  ## attributes
  # fetch the attribute vector a single time instead of once per access
  attrs <- xmlAttrs(x)
  if (!is.null(attrs)) {
    attrNames <- names(attrs)
    for (i in seq_along(attrs)) {
      records <- append(records, list(
        path = pathName,
        node = attrNames[i],
        value = attrs[[i]],
        parent = parentName,
        type = "attribute"
      ))
    }
  }

  return(records)
}

## node table builder (no longer recursive; name kept for existing callers)
# Evaluate `xpath` once and store the records of the matched nodes in the
# global data frame `dt`.
#
# The query is evaluated exactly one time. Descending to every childless
# node and re-running a document-wide expression such as "//node()" at each
# of them repeats the same search once per leaf and overwrites `dt` with an
# identical table every time, which is what makes large files slow.
#
# Args:
#   node:  an XML node; when NULL nothing is done and NULL is returned.
#   xpath: XPath expression handed to xpathSApply(). An expression that
#          starts with "//" is matched against the whole document; a
#          relative expression is resolved against `node`.
#
# Returns:
#   Invisibly, the data frame assigned to `dt`, or NULL when the query
#   result is not a list.
#
# Side effects:
#   Replaces the variable `dt` in an enclosing environment via `<<-`.
visitNode <- function(node, xpath) {
  if (is.null(node)) {
    return()
  }

  # one pass over the matched nodes; getValues() describes each of them
  result <- xpathSApply(node, path = xpath, getValues)

  # add list type result to data frame
  if (is.list(result)) {
    dt <<- do.call(rbind.fill, lapply(result, data.frame))
  }
}


# Parse the input file into a tree and take hold of its root node.
xtree <- xmlParse("test.xml")
root <- xmlRoot(xtree)

# Placeholder for the result table; visitNode() replaces it through `<<-`.
dt <- data.frame(path = NA, node = NA, value = NA, parent = NA, type = NA)

# Expression that selects every node of the document.
xpath <- "//node()"

# Fill `dt` from the document.
visitNode(root, xpath)

# Show the collected table.
dt

1 个答案:

答案 0 :(得分:4)

我真的希望有很好的XSLT支持,但我似乎找不到一个好用的软件包。另一种策略是将XML转换为更简单的数据文件,然后用read.table之类的函数轻松读取。使用xmlEventParse可以很容易地做到这一点。下面是一个自定义处理程序,它似乎可以生成您想要的数据:

# Build the handler list for xmlEventParse() that writes one delimited row
# per element, per attribute and per non-blank text chunk.
#
# Args:
#   file: destination handed to cat(); the default "" prints to the console.
#   sep:  separator placed between the five fields of a row.
#
# Returns:
#   A list with the handlers .startDocument, .startElement, .endElement and
#   .text. Every handler returns .state; its `path` element holds the names
#   of the currently open elements, outermost first.
#
# NOTE(review): values are written verbatim, so a value that contains `sep`
# or a line break produces a malformed row.
getHandler <- function(file = "", sep = ",") {
  # slash-separated path of the currently open elements
  joinPath <- function(path) paste(path, collapse = "/")

  # name of the element one level above the innermost one; zero-length for
  # the root element
  parentOf <- function(path) path[length(path) - 1]

  # terminate the current output row
  endRow <- function() cat("\n", file = file, append = TRUE)

  list(
    .startDocument = function(.state) {
      # the header is written without append, so an existing file is replaced
      cat("path", "node", "value", "parent", "type", file = file, sep = sep)
      cat("\n", file = file, sep = sep, append = TRUE)
      .state
    },
    .startElement = function(name, atts, .state) {
      # push the element that was just opened
      .state$path <- c(.state$path, name)
      here <- joinPath(.state$path)
      parent <- parentOf(.state$path)

      cat(here, name, NA, parent, "element",
          sep = sep, file = file, append = TRUE)
      endRow()

      if (!is.null(atts)) {
        # paste() is vectorised over the attributes: one row per attribute
        cat(paste(here, names(atts), atts, parent, "attribute",
                  sep = sep, collapse = "\n"),
            file = file, append = TRUE)
        endRow()
      }
      .state
    },
    .endElement = function(name, .state) {
      # pop the element that was just closed
      .state$path <- .state$path[-length(.state$path)]
      .state
    },
    .text = function(value, .state) {
      # strip surrounding whitespace; whitespace-only chunks are skipped
      value <- gsub("^\\s+|\\s+$", "", value)
      if (nchar(value) > 0) {
        cat(joinPath(.state$path), .state$path[length(.state$path)], value,
            parentOf(.state$path), "text",
            sep = sep, file = file, append = TRUE)
        endRow()
      }
      .state
    }
  )
}

所以它并不算漂亮,但它基本上只是用cat()构建一个字符串。然后我们可以像下面这样使用它:
# Stream test.xml through the handlers. getHandler() keeps its defaults
# (file = "", sep = ","), so the rows are printed rather than saved.
zz <- xmlEventParse(
  "test.xml",
  handlers = getHandler(),
  state = list(path = character(0)),
  useDotNames = TRUE
)

这将以逗号分隔值的数据输出到屏幕。要保存到文件,您可以执行

# Same streaming parse, but the handlers write tab-separated rows to ok.txt.
zz <- xmlEventParse(
  "test.xml",
  handlers = getHandler(file = "ok.txt", sep = "\t"),
  state = list(path = character(0)),
  useDotNames = TRUE
)

这会将制表符分隔的数据写入名为“ok.txt”的文件。然后,您可以使用以下命令读取数据:
# The rows in ok.txt are written without quoting, so quote and comment
# processing are switched off: a value containing ', " or # would otherwise
# be split or truncated while reading.
read.table("ok.txt", sep = "\t", header = TRUE, quote = "", comment.char = "")

返回

                      path      node                value  parent      type
1                  CATALOG   CATALOG                 <NA>           element
2            CATALOG/PLANT     PLANT                 <NA> CATALOG   element
3            CATALOG/PLANT        id                    1 CATALOG attribute
4            CATALOG/PLANT  required                false CATALOG attribute
5     CATALOG/PLANT/COMMON    COMMON                 <NA>   PLANT   element
6     CATALOG/PLANT/COMMON    Source                  NLM   PLANT attribute
7     CATALOG/PLANT/COMMON    COMMON            Bloodroot   PLANT      text
8  CATALOG/PLANT/BOTANICAL BOTANICAL                 <NA>   PLANT   element
9  CATALOG/PLANT/BOTANICAL BOTANICAL Aquilegia canadensis   PLANT      text
10      CATALOG/PLANT/DATE      DATE                 <NA>   PLANT   element
11 CATALOG/PLANT/DATE/Year      Year                 <NA>    DATE   element
12 CATALOG/PLANT/DATE/Year      Year                 2013    DATE      text
13           CATALOG/PLANT     PLANT                 <NA> CATALOG   element
14           CATALOG/PLANT        id                    2 CATALOG attribute
15           CATALOG/PLANT  required                 true CATALOG attribute
16    CATALOG/PLANT/COMMON    COMMON                 <NA>   PLANT   element
17    CATALOG/PLANT/COMMON    Source                  LNP   PLANT attribute
18    CATALOG/PLANT/COMMON    COMMON            Columbine   PLANT      text
19 CATALOG/PLANT/BOTANICAL BOTANICAL                 <NA>   PLANT   element
20 CATALOG/PLANT/BOTANICAL BOTANICAL Aquilegia canadensis   PLANT      text
21      CATALOG/PLANT/DATE      DATE                 <NA>   PLANT   element
22 CATALOG/PLANT/DATE/Year      Year                 <NA>    DATE   element
23 CATALOG/PLANT/DATE/Year      Year                 2014    DATE      text

现在输出的行数比您示例中的要多,因为其中有些筛选规则我并不清楚。

主要思想是xmlEventParse比xmlParse更有效,因为它不必加载整个树。此外,通过使用cat()将结果转储到文件,我不必立即担心内存管理(不过像这样写入磁盘也并不算理想)。

无论如何,它至少是另一个需要考虑的选择。