我有一段代码,用于解析足球比赛数据源(feed)中的单个 XML 文件。但是,我有 300 多场比赛的数据,我想把这段代码应用到所有这些数据文件上,因为逐个手工处理太费时间。我是数据科学的新手,虽然已经看过其他关于解析多个 XML 文件的帖子,但我不知道该如何修改代码,使其适用于这种数据结构。
library(XML)
library(plyr)
library(gdata)
library(reshape)
# Open an interactive file picker and keep the chosen path in `f24`;
# this is the XML file that gets parsed further down.
f24 <- file.choose()
# Collect the attributes of every matching element into one data frame.
#
# Args:
#   XML.parsed: parsed XML document that xpathApply() can query.
#   field: element name placed after "//" in the XPath query; only elements
#     carrying at least one attribute are selected.
#
# Returns:
#   A data frame of character columns with one row per matched element and
#   one column per distinct attribute name, in order of first appearance.
#   Attributes an element does not have are NA. An empty data frame is
#   returned when nothing matches.
grabAll <- function(XML.parsed, field) {
  xpath <- paste0("//", field, "[@*]")
  # xpathApply() always yields a list, unlike sapply(), whose result shape
  # depends on how many attributes each node happens to have.
  attrs <- xpathApply(XML.parsed, xpath, xmlAttrs)
  if (length(attrs) == 0L) {
    return(data.frame())
  }

  # Make attribute names syntactic, as data.frame() would for its columns.
  attrs <- lapply(attrs, function(a) {
    names(a) <- make.names(names(a), unique = TRUE)
    a
  })
  columns <- unique(unlist(lapply(attrs, names), use.names = FALSE))

  # Fill by attribute name, never by position: nodes with the same number of
  # attributes but different names must not end up under the wrong column.
  filled <- matrix(
    NA_character_,
    nrow = length(attrs),
    ncol = length(columns),
    dimnames = list(NULL, columns)
  )
  for (i in seq_along(attrs)) {
    filled[i, names(attrs[[i]])] <- as.character(attrs[[i]])
  }
  as.data.frame(filled, stringsAsFactors = FALSE)
}
# Play-by-Play Parsing
# Parse the chosen file and gather the attributes of every Event element.
pbpParse <- xmlInternalTreeParse(f24)
eventInfo <- grabAll(pbpParse, "Event")

# Count the Q children under each Event; the counts are used below to repeat
# each event's identifying columns once per qualifier row.
eventParse <- xpathSApply(pbpParse, "//Event")
NInfo <- vapply(
  eventParse,
  function(x) sum(names(xmlChildren(x)) == "Q"),
  integer(1)
)
QInfo <- grabAll(pbpParse, "Q")

# Qualifier rows are matched to events purely by document order, so the
# counts must agree; otherwise cbind() could recycle rows and misalign them.
if (sum(NInfo) != NROW(QInfo)) {
  stop(
    "Number of Q rows (", NROW(QInfo), ") does not match the Q children ",
    "counted under Event nodes (", sum(NInfo), ")",
    call. = FALSE
  )
}

# The first two Event columns are repeated per qualifier; column 1 becomes
# the join key `Eid`, and the first Q column is renamed `Qid`.
EventsExpanded <- as.data.frame(
  lapply(eventInfo[, 1:2], function(x) rep(x, NInfo)),
  stringsAsFactors = FALSE
)
QInfo <- cbind(EventsExpanded, QInfo)
names(QInfo)[c(1, 3)] <- c("Eid", "Qid")

# Qualifiers without a value are recorded as 1. Create the column first when
# no Q carried a `value` attribute, so the replacement below cannot fail.
if (!"value" %in% names(QInfo)) {
  QInfo$value <- rep(NA_character_, nrow(QInfo))
}
QInfo$value <- ifelse(is.na(QInfo$value), 1, QInfo$value)

# Reshape to one row per event with one column per qualifier_id.
Qual <- cast(QInfo, Eid ~ qualifier_id)

# FINAL DATA FOR ONE GAME
# Left join so that events without any qualifier are kept.
events <- merge(
  eventInfo, Qual,
  by.x = "id", by.y = "Eid",
  all.x = TRUE, suffixes = c("", "Q")
)