Question

我有一段代码，用来解析单个足球比赛数据源（feed）的 XML 文件。但是，我有 300 多场比赛的数据，我想把这段代码应用到所有这些数据源文件上，因为逐个手工处理太费时间。我是数据科学的新手，虽然已经看过其他关于解析多个 XML 文件的帖子，但仍然不知道如何修改代码以使其适合这种数据结构。

library(XML)
library(plyr)
library(gdata)
library(reshape)

f24 <- file.choose() # Path of the XML file to parse, picked interactively by the user

# Collect the attributes of every <field> element of a parsed XML document
# into a data frame: one row per element, one column per attribute name.
#
# Arguments:
#   XML.parsed  Parsed document (e.g. the result of xmlInternalTreeParse()).
#   field       Element name to look for anywhere in the document. Elements
#               that carry no attributes are excluded by the "[@*]" predicate.
#
# Returns:
#   A data frame of character columns. Attributes missing on some elements
#   are filled with NA. An empty data frame is returned when nothing matches.
grabAll <- function(XML.parsed, field) {
  nodes <- xpathSApply(XML.parsed, paste0("//", field, "[@*]"))

  # lapply() always yields a list, whereas sapply() would collapse to a plain
  # vector when every node has a single attribute (giving one wide row).
  attrs <- lapply(nodes, xmlAttrs)
  if (length(attrs) == 0) {
    return(data.frame())
  }

  # Fast path only when every node exposes the same attribute names in the
  # same order; otherwise values could land under the wrong column.
  first_names <- names(attrs[[1]])
  uniform <- all(vapply(
    attrs,
    function(a) identical(names(a), first_names),
    logical(1)
  ))

  if (uniform) {
    as.data.frame(do.call(rbind, attrs), stringsAsFactors = FALSE)
  } else {
    # Ragged attribute sets: build one single-row data frame per node and let
    # rbind.fill() align columns by name, padding the gaps with NA.
    do.call(
      rbind.fill,
      lapply(attrs, function(a) data.frame(t(a), stringsAsFactors = FALSE))
    )
  }
}

# Play-by-play parsing ----
# NOTE(review): to process many feeds, these statements could be wrapped in a
# function taking the file path and applied over list.files(); left as a
# script here so the existing variable names stay available.

# Parse the chosen file and pull the attributes of every Event element.
pbpParse <- xmlInternalTreeParse(f24)
eventInfo <- grabAll(pbpParse, "Event")

# Count the Q children of each Event; vapply() keeps the result an integer
# vector even when the document contains no Event at all.
eventParse <- xpathSApply(pbpParse, "//Event")
NInfo <- vapply(
  eventParse,
  function(x) sum(names(xmlChildren(x)) == "Q"),
  integer(1)
)

# Attributes of every Q element, in document order.
QInfo <- grabAll(pbpParse, "Q")

# Repeat the first two Event columns once per Q child so that they line up
# row-for-row with QInfo.
EventsExpanded <- as.data.frame(
  lapply(eventInfo[, 1:2], function(x) rep(x, NInfo)),
  stringsAsFactors = FALSE
)

# cbind() may silently recycle rows when the counts differ by a whole
# multiple, so fail loudly instead of producing misaligned data.
stopifnot(nrow(EventsExpanded) == nrow(QInfo))

QInfo <- cbind(EventsExpanded, QInfo)
names(QInfo)[c(1, 3)] <- c("Eid", "Qid")

# A Q without a value attribute is recorded as 1.
QInfo$value <- ifelse(is.na(QInfo$value), 1, QInfo$value)

# One row per event, one column per qualifier_id.
Qual <- cast(QInfo, Eid ~ qualifier_id)

# FINAL DATA FOR ONE GAME
# Left join: events without any Q child are kept, with NA qualifier columns.
events <- merge(
  eventInfo, Qual,
  by.x = "id", by.y = "Eid",
  all.x = TRUE, suffixes = c("", "Q")
)

Example of the data feed

创建函数来解析r

0 个答案: