R - 编写XML的性能问题

时间:2013-06-10 19:09:05

标签: xml performance r

我正在尝试从R中的数据框写一个XML文件,但我遇到了性能问题。

我有以下代码来编写XML文件,它适用于我的示例中的小数据框。但是,我的真实数据框包含超过50,000行和5列,处理它需要超过10小时。

如何改善性能?

require(XML)
# Example input: one row per product/location pair with two measures.
products <- c("A", "B", "C")
location <- c(1, 2, 3)
var1 <- c(1, 2, 3)
var2 <- c(1, 2, 3)
df <- data.frame(products, location, var1, var2)

# Root <data guid="snapshot_data"> node with an inner <data> container that
# receives one <element> per (row, measure column) pair.
data <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
data2 <- newXMLNode("data", parent = data)
# seq_len() gives an empty loop for a zero-row data frame, whereas 1:nrow(df)
# would iterate over c(1, 0).
for (j in seq_len(nrow(df))) {
  # Columns 3 and 4 hold the measures (var1, var2).
  for (i in 3:4) {
    element <- newXMLNode(
      "element",
      attrs = c(guid = paste(colnames(df)[i], df[j, 1], df[j, 2], sep = "_")),
      parent = data2
    )
    newXMLNode(
      "name",
      paste(colnames(df)[i], df[j, 1], df[j, 2], sep = " "),
      parent = element
    )
    value <- newXMLNode(
      "value",
      attrs = c(period = "year", unit = "Pure"),
      parent = element
    )
    newXMLNode("orig", round(df[j, i]), parent = value)
    newXMLNode("processed", parent = value)
    # <meta> holds one <ref> per dimension, each wrapping a <value guid=...>.
    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = colnames(df)[i]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = as.character(df[j, 1])), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = as.character(df[j, 2])), parent = ref)
  }
}

# Serialise the whole tree (from the root node) to disk.
saveXML(data, file = "test.xml")

1 个答案:

答案 0 :(得分:2)

这里有几个拖慢您速度的问题。首先,您的嵌套循环可能对您没有多大帮助。您可以通过重塑数据框来摆脱嵌套循环:

require(XML)
require(reshape2)

# Example input: one row per product/location pair with two measures.
products <- c("A", "B", "C")
location <- c(1, 2, 3)
var1 <- c(1, 2, 3)
var2 <- c(1, 2, 3)
df <- data.frame(products, location, var1, var2)

# Reshape to long format: one row per (products, location, variable).
df2 <- melt(df, id.vars = c("products", "location"))
# Convert every factor column to character, leaving other columns untouched.
# Assigning through `df2[]` keeps the data frame structure regardless of how
# many columns are factors; selecting with `df2[, mask]` would drop to a bare
# vector when exactly one column is a factor and corrupt that column.
df2[] <- lapply(df2, function(col) {
  if (is.factor(col)) as.character(col) else col
})

df2
  products location variable value
1        A        1     var1     1
2        B        2     var1     2
3        C        3     var1     3
4        A        1     var2     1
5        B        2     var2     2
6        C        3     var2     3

这样,您希望包含在XML中的每个指标都在其自己的单独列中排列。

这将产生以下用于构建XML树的方法(包含在稍后进行基准测试的函数中):

# Build the XML tree from the long-format data frame `df2` (one <element> per
# row), pasting ids/names and rounding values inside the loop.
# Arguments passed through `...` are accepted but not used.
# Returns the inner <data> node that holds the <element> children.
xml2 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() gives an empty loop for a zero-row data frame, whereas
  # 1:nrow(df2) would iterate over c(1, 0).
  for (j in seq_len(nrow(df2))) {
    element <- newXMLNode(
      "element",
      attrs = c(
        guid = paste(df2$variable[j], df2$products[j], df2$location[j],
                     sep = "_")
      ),
      parent = container
    )
    newXMLNode(
      "name",
      paste(df2$variable[j], df2$products[j], df2$location[j], sep = " "),
      parent = element
    )
    value <- newXMLNode(
      "value",
      attrs = c(period = "year", unit = "Pure"),
      parent = element
    )
    newXMLNode("orig", round(df2$value[j]), parent = value)
    newXMLNode("processed", parent = value)
    # <meta> holds one <ref> per dimension, each wrapping a <value guid=...>.
    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df2$variable[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df2$products[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df2$location[j]), parent = ref)
  }
  container
}

除此之外,您还在不必要地调用as.character函数,并且在循环中反复调用round和paste函数,而这些函数本可以事先对整个数据框调用一次:

# Precompute, once for the whole table, everything the per-row loop needs:
# the element guid ("_"-joined key), the display name (space-joined key) and
# the rounded value.
df3 <- df2
key_parts <- unname(as.list(df3[c("variable", "products", "location")]))
df3[["element"]] <- do.call(paste, c(key_parts, sep = "_"))
df3[["name"]] <- do.call(paste, c(key_parts, sep = " "))
df3[["value"]] <- round(df3[["value"]])

这导致:

# Build the XML tree from `df3`, whose `element`, `name` and rounded `value`
# columns are read directly instead of being recomputed for each row.
# Arguments passed through `...` are accepted but not used.
# Returns the inner <data> node that holds the <element> children.
xml3 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() gives an empty loop for a zero-row data frame, whereas
  # 1:nrow(df3) would iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    element <- newXMLNode("element", attrs = c(guid = df3$element[j]),
                          parent = container)
    newXMLNode("name", df3$name[j], parent = element)
    value <- newXMLNode(
      "value",
      attrs = c(period = "year", unit = "Pure"),
      parent = element
    )
    newXMLNode("orig", df3$value[j], parent = value)
    newXMLNode("processed", parent = value)
    # <meta> holds one <ref> per dimension, each wrapping a <value guid=...>.
    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df3$variable[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df3$products[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df3$location[j]), parent = ref)
  }
  container
}

最后,您可以在调用newXMLNode时直接创建子节点:

# Build the XML tree from `df3`, passing child nodes through `.children` in
# the newXMLNode() call instead of attaching each one via `parent` afterwards.
# <meta> is still attached to its <element> in a second call.
# Arguments passed through `...` are accepted but not used.
# Returns the inner <data> node that holds the <element> children.
xml4 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() gives an empty loop for a zero-row data frame, whereas
  # 1:nrow(df3) would iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    element <- newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value",
          attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        )
      )
    )
    newXMLNode(
      "meta",
      parent = element,
      .children = list(
        newXMLNode(
          "ref",
          attrs = c("source-guid" = "fs_items"),
          .children = newXMLNode("value", attrs = c(guid = df3$variable[j]))
        ),
        newXMLNode(
          "ref",
          attrs = c("source-guid" = "products"),
          .children = newXMLNode("value", attrs = c(guid = df3$products[j]))
        ),
        newXMLNode(
          "ref",
          attrs = c("source-guid" = "location"),
          .children = newXMLNode("value", attrs = c(guid = df3$location[j]))
        )
      )
    )
  }
  container
}

所以,如果我们采用你原来的过程:

# Baseline for the benchmark: the original nested-loop approach over the wide
# data frame `df`, creating one <element> per (row, measure column) pair and
# recomputing the pasted ids/names for each one.
# Arguments passed through `...` are accepted but not used.
# Returns the inner <data> node that holds the <element> children.
xml1 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() gives an empty loop for a zero-row data frame, whereas
  # 1:nrow(df) would iterate over c(1, 0).
  for (j in seq_len(nrow(df))) {
    # Columns 3 and 4 hold the measures (var1, var2).
    for (i in 3:4) {
      element <- newXMLNode(
        "element",
        attrs = c(
          guid = paste(colnames(df)[i], df[j, 1], df[j, 2], sep = "_")
        ),
        parent = container
      )
      newXMLNode(
        "name",
        paste(colnames(df)[i], df[j, 1], df[j, 2], sep = " "),
        parent = element
      )
      value <- newXMLNode(
        "value",
        attrs = c(period = "year", unit = "Pure"),
        parent = element
      )
      newXMLNode("orig", round(df[j, i]), parent = value)
      newXMLNode("processed", parent = value)
      # <meta> holds one <ref> per dimension, each wrapping a <value guid=...>.
      meta <- newXMLNode("meta", parent = element)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                        parent = meta)
      newXMLNode("value", attrs = c(guid = colnames(df)[i]), parent = ref)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                        parent = meta)
      newXMLNode("value", attrs = c(guid = as.character(df[j, 1])),
                 parent = ref)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                        parent = meta)
      newXMLNode("value", attrs = c(guid = as.character(df[j, 2])),
                 parent = ref)
    }
  }
  container
}

并对其进行基准测试:

# microbenchmark() comes from the microbenchmark package, which the code shown
# above never attaches; load it before timing the four variants.
library(microbenchmark)
microbenchmark(xml1(), xml2(), xml3(), xml4())
Unit: milliseconds
   expr       min        lq    median        uq      max neval
 xml1() 100.43712 100.97356 101.52694 102.28243 367.6518   100
 xml2()  99.38772 100.02676 100.63210 101.19588 373.8043   100
 xml3()  98.91923  99.67163 100.22482 100.92313 394.2360   100
 xml4()  82.09688  82.60983  83.02559  83.64807 367.6711   100

重新整形数据框,以及对整个数据框一次性调用函数而不是在for循环中重复调用,都只有(非常)微小的帮助;真正节省时间的做法是在调用newXMLNode时就把子节点分配给父节点。它仍然不会很快,但应该比您现在的做法更快。

修改

如果您需要更高的速度,可以进一步合并节点的创建(在第一次调用newXMLNode时就将“meta”指定为“element”的子节点):

# Build the XML tree from `df3`, creating each <element> together with all of
# its descendants (<name>, <value>, <meta>) in a single nested newXMLNode()
# call via `.children`.
# Arguments passed through `...` are accepted but not used.
# Returns the inner <data> node that holds the <element> children.
xml5 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() gives an empty loop for a zero-row data frame, whereas
  # 1:nrow(df3) would iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value",
          attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        ),
        newXMLNode(
          "meta",
          .children = list(
            newXMLNode(
              "ref",
              attrs = c("source-guid" = "fs_items"),
              .children = newXMLNode("value",
                                     attrs = c(guid = df3$variable[j]))
            ),
            newXMLNode(
              "ref",
              attrs = c("source-guid" = "products"),
              .children = newXMLNode("value",
                                     attrs = c(guid = df3$products[j]))
            ),
            newXMLNode(
              "ref",
              attrs = c("source-guid" = "location"),
              .children = newXMLNode("value",
                                     attrs = c(guid = df3$location[j]))
            )
          )
        )
      )
    )
  }
  container
}

然而,除此之外,您可能需要重新评估构建XML文档本身的方式。例如,如果把原本放在“value”子节点中的值直接作为“ref”节点的节点值(目前“ref”节点只有属性而没有节点值),则循环的每次迭代可以少调用三次newXMLNode:

# Build a simplified XML tree from `df3`: each <ref> carries its value as node
# text instead of wrapping a nested <value guid=...> child, so each row needs
# three fewer newXMLNode() calls. Note this changes the document structure
# compared with the other variants.
# Arguments passed through `...` are accepted but not used.
# Returns the inner <data> node that holds the <element> children.
xml6 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() gives an empty loop for a zero-row data frame, whereas
  # 1:nrow(df3) would iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value",
          attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        ),
        newXMLNode(
          "meta",
          .children = list(
            newXMLNode("ref", df3$variable[j],
                       attrs = c("source-guid" = "fs_items")),
            newXMLNode("ref", df3$products[j],
                       attrs = c("source-guid" = "products")),
            newXMLNode("ref", df3$location[j],
                       attrs = c("source-guid" = "location"))
          )
        )
      )
    )
  }
  container
}

简化XML文档的结构可以提高速度:

# Time all six variants, including xml5() and xml6() defined above.
microbenchmark(xml1(), xml2(), xml3(), xml4(), xml5(), xml6())

Unit: milliseconds
   expr      min        lq    median        uq      max neval
 xml1() 99.66528 100.79417 101.09906 101.56140 393.4303   100
 xml2() 98.58393  99.68279  99.90569 100.64327 392.6561   100
 xml3() 98.26595  99.41217  99.65450 100.37495 363.4646   100
 xml4() 81.32157  82.33324  82.62350  82.96958 363.4569   100
 xml5() 78.89286  79.96670  80.14763  80.74278 346.1388   100
 xml6() 71.17018  72.05212  72.36548  72.81261 334.9638   100

这仍然不会将运行时间从几小时缩短到几分钟。如果你真的需要快速运行的东西,我会选择R之外的其他东西来更快地处理循环。