我正在尝试从R中的数据框写一个XML文件,但我遇到了性能问题。
我有以下代码来编写XML文件,它适用于我示例中的小数据框。但是,我的真实数据框包含超过50,000行和5列,处理它需要超过10小时。
如何提高性能?
# Load XML with library() so that a missing package stops the script right
# away; require() would only return FALSE and let the script fail later.
library(XML)

# Small example data frame: identifier columns `products` and `location`,
# followed by the measure columns `var1` and `var2`.
products <- c("A", "B", "C")
location <- c(1, 2, 3)
var1 <- c(1, 2, 3)
var2 <- c(1, 2, 3)
df <- data.frame(products, location, var1, var2)

# Root <data guid="snapshot_data"> node and the nested <data> node that
# receives one <element> per (row, measure column) pair.
data <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
data2 <- newXMLNode("data", parent = data)

# seq_len() makes the loop a no-op for an empty data frame, whereas
# 1:nrow(df) would iterate over c(1, 0).
for (j in seq_len(nrow(df))) {
  # Columns 3 and 4 are the measure columns (var1, var2).
  for (i in 3:4) {
    # <element guid="<column>_<product>_<location>">
    element <- newXMLNode(
      "element",
      attrs = c(guid = paste(colnames(df)[i], df[j, 1], df[j, 2], sep = "_")),
      parent = data2
    )
    # <name> carries the same three parts separated by spaces.
    name <- newXMLNode(
      "name",
      paste(colnames(df)[i], df[j, 1], df[j, 2], sep = " "),
      parent = element
    )
    # <value> holds the rounded measure in <orig> plus an empty <processed>.
    value <- newXMLNode(
      "value",
      attrs = c(period = "year", unit = "Pure"),
      parent = element
    )
    orig <- newXMLNode("orig", round(df[j, i]), parent = value)
    processes <- newXMLNode("processed", parent = value)
    # <meta> lists three <ref> nodes, each wrapping a <value guid="...">.
    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"), parent = meta)
    value <- newXMLNode("value", attrs = c(guid = colnames(df)[i]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"), parent = meta)
    value <- newXMLNode(
      "value",
      attrs = c(guid = as.character(df[j, 1])),
      parent = ref
    )
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"), parent = meta)
    value <- newXMLNode(
      "value",
      attrs = c(guid = as.character(df[j, 2])),
      parent = ref
    )
  }
}

# Serialize the whole tree, starting at the root node, to test.xml.
saveXML(data, file = "test.xml")
答案 0 :(得分:2)
这里有几个拖慢你速度的问题。首先,嵌套循环对你可能没有多大帮助。您可以通过重塑数据框来去掉它:
# library() fails loudly when a package is missing; require() only returns
# FALSE.
library(XML)
library(reshape2)

# Same example data as in the question.
products <- c("A", "B", "C")
location <- c(1, 2, 3)
var1 <- c(1, 2, 3)
var2 <- c(1, 2, 3)
df <- data.frame(products, location, var1, var2)

# Long format: one row per (products, location, measure) combination, with the
# measure name in `variable` and its number in `value`.
df2 <- melt(df, id.vars = c("products", "location"))

# Convert factor columns to character.
# List-style indexing (df2[is_factor]) always yields a data frame, even when
# only one column is a factor; df2[, is_factor] would drop to a bare vector in
# that case and the lapply() result would no longer line up with the columns.
# vapply() guarantees a logical mask regardless of the input.
is_factor <- vapply(df2, is.factor, logical(1))
if (any(is_factor)) {
  df2[is_factor] <- lapply(df2[is_factor], as.character)
}
df2
products location variable value
1 A 1 var1 1
2 B 2 var1 2
3 C 3 var1 3
4 A 1 var2 1
5 B 2 var2 2
6 C 3 var2 3
这样,您希望包含在XML中的每个指标都在其自己的单独列中排列。
这将产生以下用于构建XML树的方法(包含在稍后进行基准测试的函数中):
# Build the XML tree from the long-format data frame `df2`, creating one
# <element> per row. `df2` is a free variable looked up outside the function.
# Every node is attached to its parent via the `parent` argument, and the
# paste()/round() work is still done once per row inside the loop.
#
# Arguments:
#   ...  Accepted but not used.
# Returns:
#   The inner <data> node that holds all <element> nodes.
xml2 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() skips the loop for a zero-row data frame; 1:nrow(df2) would
  # iterate over c(1, 0).
  for (j in seq_len(nrow(df2))) {
    element <- newXMLNode(
      "element",
      attrs = c(
        guid = paste(df2$variable[j], df2$products[j], df2$location[j], sep = "_")
      ),
      parent = container
    )
    newXMLNode(
      "name",
      paste(df2$variable[j], df2$products[j], df2$location[j], sep = " "),
      parent = element
    )
    value_node <- newXMLNode(
      "value",
      attrs = c(period = "year", unit = "Pure"),
      parent = element
    )
    newXMLNode("orig", round(df2$value[j]), parent = value_node)
    newXMLNode("processed", parent = value_node)
    # <meta> lists three <ref> nodes, each wrapping a <value guid="...">.
    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"), parent = meta)
    newXMLNode("value", attrs = c(guid = df2$variable[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"), parent = meta)
    newXMLNode("value", attrs = c(guid = df2$products[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"), parent = meta)
    newXMLNode("value", attrs = c(guid = df2$location[j]), parent = ref)
  }
  container
}
除此之外,您还在不必要地调用 as.character 函数,并且在循环中反复调用 round 和 paste 函数,而这些函数本可以事先对整个数据框一次性调用:
# Precompute, once for the whole data frame, everything the loops would
# otherwise recompute per row: the element guid, the display name and the
# rounded value.
df3 <- df2
df3$element <- with(df3, paste(variable, products, location, sep = "_"))
df3$name <- with(df3, paste(variable, products, location, sep = " "))
df3$value <- round(df3$value)
这导致:
# Build the XML tree from `df3`, whose `element`, `name` and `value` columns
# were precomputed, so the loop only reads values. `df3` is a free variable
# looked up outside the function. Nodes are still attached one at a time via
# the `parent` argument.
#
# Arguments:
#   ...  Accepted but not used.
# Returns:
#   The inner <data> node that holds all <element> nodes.
xml3 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() skips the loop for a zero-row data frame; 1:nrow(df3) would
  # iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    element <- newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container
    )
    newXMLNode("name", df3$name[j], parent = element)
    value_node <- newXMLNode(
      "value",
      attrs = c(period = "year", unit = "Pure"),
      parent = element
    )
    newXMLNode("orig", df3$value[j], parent = value_node)
    newXMLNode("processed", parent = value_node)
    # <meta> lists three <ref> nodes, each wrapping a <value guid="...">.
    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"), parent = meta)
    newXMLNode("value", attrs = c(guid = df3$variable[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"), parent = meta)
    newXMLNode("value", attrs = c(guid = df3$products[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"), parent = meta)
    newXMLNode("value", attrs = c(guid = df3$location[j]), parent = ref)
  }
  container
}
最后,您可以在调用 newXMLNode 时(通过 .children 参数)直接把子节点分配给父节点:
# Build the XML tree from the precomputed data frame `df3` (a free variable
# looked up outside the function), passing child nodes through `.children`
# when the parent is created instead of attaching each one afterwards.
# <meta> is still attached to <element> in a second call.
#
# Arguments:
#   ...  Accepted but not used.
# Returns:
#   The inner <data> node that holds all <element> nodes.
xml4 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() skips the loop for a zero-row data frame; 1:nrow(df3) would
  # iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    # <element> with its <name> and <value> (<orig>, <processed>) children.
    element <- newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value",
          attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        )
      )
    )
    # <meta> with three <ref> children, each wrapping a <value guid="...">.
    newXMLNode(
      "meta",
      parent = element,
      .children = list(
        newXMLNode(
          "ref",
          attrs = c("source-guid" = "fs_items"),
          .children = newXMLNode("value", attrs = c(guid = df3$variable[j]))
        ),
        newXMLNode(
          "ref",
          attrs = c("source-guid" = "products"),
          .children = newXMLNode("value", attrs = c(guid = df3$products[j]))
        ),
        newXMLNode(
          "ref",
          attrs = c("source-guid" = "location"),
          .children = newXMLNode("value", attrs = c(guid = df3$location[j]))
        )
      )
    )
  }
  container
}
所以,如果我们采用你原来的过程:
# Baseline for the benchmark: the question's original nested-loop procedure
# wrapped in a function. It reads the wide data frame `df` (a free variable
# looked up outside the function) and, on purpose, keeps calling paste(),
# round() and as.character() on every iteration so that it stays comparable
# with the original code.
#
# Arguments:
#   ...  Accepted but not used.
# Returns:
#   The inner <data> node that holds all <element> nodes.
xml1 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() skips the loop for a zero-row data frame; 1:nrow(df) would
  # iterate over c(1, 0).
  for (j in seq_len(nrow(df))) {
    # Columns 3 and 4 are the measure columns.
    for (i in 3:4) {
      element <- newXMLNode(
        "element",
        attrs = c(guid = paste(colnames(df)[i], df[j, 1], df[j, 2], sep = "_")),
        parent = container
      )
      newXMLNode(
        "name",
        paste(colnames(df)[i], df[j, 1], df[j, 2], sep = " "),
        parent = element
      )
      value_node <- newXMLNode(
        "value",
        attrs = c(period = "year", unit = "Pure"),
        parent = element
      )
      newXMLNode("orig", round(df[j, i]), parent = value_node)
      newXMLNode("processed", parent = value_node)
      # <meta> lists three <ref> nodes, each wrapping a <value guid="...">.
      meta <- newXMLNode("meta", parent = element)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"), parent = meta)
      newXMLNode("value", attrs = c(guid = colnames(df)[i]), parent = ref)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "products"), parent = meta)
      newXMLNode("value", attrs = c(guid = as.character(df[j, 1])), parent = ref)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "location"), parent = meta)
      newXMLNode("value", attrs = c(guid = as.character(df[j, 2])), parent = ref)
    }
  }
  container
}
并对其进行基准测试:
# Namespace-qualified so the call works without attaching the package; no
# snippet shown here loads microbenchmark.
microbenchmark::microbenchmark(xml1(), xml2(), xml3(), xml4())
Unit: milliseconds
expr min lq median uq max neval
xml1() 100.43712 100.97356 101.52694 102.28243 367.6518 100
xml2() 99.38772 100.02676 100.63210 101.19588 373.8043 100
xml3() 98.91923 99.67163 100.22482 100.92313 394.2360 100
xml4() 82.09688 82.60983 83.02559 83.64807 367.6711 100
重塑数据框,以及对整个数据框只调用一次函数(而不是在for循环中重复调用),这两者都只有(非常)小的帮助;真正节省时间的是在调用 newXMLNode 时就把子节点分配给父节点。
它仍然不会很快,但应该比你现在的做法更快。
修改:
如果您需要更高的速度,可以进一步折叠节点的创建(在第一次调用newXMLNode时就将“meta”指定为“element”的子节点):
# Build the XML tree from the precomputed data frame `df3` (a free variable
# looked up outside the function) with one nested newXMLNode() expression per
# row: <name>, <value> and <meta> are all handed to <element> through
# `.children`.
#
# Arguments:
#   ...  Accepted but not used.
# Returns:
#   The inner <data> node that holds all <element> nodes.
xml5 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() skips the loop for a zero-row data frame; 1:nrow(df3) would
  # iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value",
          attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        ),
        # <meta> with three <ref> children, each wrapping a <value guid="...">.
        newXMLNode(
          "meta",
          .children = list(
            newXMLNode(
              "ref",
              attrs = c("source-guid" = "fs_items"),
              .children = newXMLNode("value", attrs = c(guid = df3$variable[j]))
            ),
            newXMLNode(
              "ref",
              attrs = c("source-guid" = "products"),
              .children = newXMLNode("value", attrs = c(guid = df3$products[j]))
            ),
            newXMLNode(
              "ref",
              attrs = c("source-guid" = "location"),
              .children = newXMLNode("value", attrs = c(guid = df3$location[j]))
            )
          )
        )
      )
    )
  }
  container
}
然而,除此之外,您可能需要重新评估构建XML文档本身的方式。例如,如果把原来放在“value”子节点里的值直接作为“ref”节点的节点值(目前“ref”节点只有属性而没有节点值),则循环的每次迭代可以少调用三次newXMLNode:
# Build a simplified XML tree from the precomputed data frame `df3` (a free
# variable looked up outside the function). Each <ref> carries its value as
# node content instead of wrapping a separate <value guid="..."> child, so
# every row needs three fewer newXMLNode() calls. The resulting document
# structure therefore differs from the one produced by xml1() to xml5().
#
# Arguments:
#   ...  Accepted but not used.
# Returns:
#   The inner <data> node that holds all <element> nodes.
xml6 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() skips the loop for a zero-row data frame; 1:nrow(df3) would
  # iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value",
          attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        ),
        newXMLNode(
          "meta",
          .children = list(
            newXMLNode(
              "ref", df3$variable[j],
              attrs = c("source-guid" = "fs_items")
            ),
            newXMLNode(
              "ref", df3$products[j],
              attrs = c("source-guid" = "products")
            ),
            newXMLNode(
              "ref", df3$location[j],
              attrs = c("source-guid" = "location")
            )
          )
        )
      )
    )
  }
  container
}
简化XML文档的结构可以提高速度:
# Namespace-qualified so the call works without attaching the package; no
# snippet shown here loads microbenchmark.
microbenchmark::microbenchmark(xml1(), xml2(), xml3(), xml4(), xml5(), xml6())
Unit: milliseconds
expr min lq median uq max neval
xml1() 99.66528 100.79417 101.09906 101.56140 393.4303 100
xml2() 98.58393 99.68279 99.90569 100.64327 392.6561 100
xml3() 98.26595 99.41217 99.65450 100.37495 363.4646 100
xml4() 81.32157 82.33324 82.62350 82.96958 363.4569 100
xml5() 78.89286 79.96670 80.14763 80.74278 346.1388 100
xml6() 71.17018 72.05212 72.36548 72.81261 334.9638 100
这仍然不会将运行时间从几小时缩短到几分钟。如果你真的需要快速运行的东西,我会选择R之外的其他东西来更快地处理循环。