我正在尝试使用R中的XML包解析XML文件(超过200,000个文件,800MB)中的内容,并将它们保存到文本文件中以供进一步处理。然而,我的笔记本电脑只有4G RAM,R会话总是在这样做时崩溃。我的代码如下,我试图在rm()之后使用ldply(),rm()和gc()。然而记忆问题仍然存在。有人可以指出我的问题吗?非常感谢你!
# read the file names
file_list = list.files()
parseXml = function(filename) {
data = xmlTreeParse(filename, useInternalNodes = T)
for (i in 1:length(xpathApply(data, "//mesh_term", xmlValue)) ) {
tmp = data.frame("nct_id" = character(), "mesh_term" = character(),
stringsAsFactors = F)
# skip those trials data without mesh_term
if (length(xpathApply(data, "//mesh_term", xmlValue)) > 0) {
tmp[1, 1] = xpathApply(data, "//nct_id", xmlValue)[[1]]
tmp[1, 2] = xpathApply(data, "//mesh_term", xmlValue)[[i]]
}
}
return(tmp)
rm(tmp)
gc()
}
# chop file_list into 1000 sections and do
# 1000 iteration, I assume that this can save some memory (but useless)
n = 1000
for (i in 1:n) {
trialMesh = ldply(file_list[ (length(file_list)/n * (i-1) + 1) : (length(file_list)/n * i) ],
parseXml)
write.table(trialMesh, paste0("mypath/trialMesh_", i, ".txt"), sep="|",
eol="\n", quote=F, row.names = F, col.names = T)
rm(trialMesh)
gc()
}