我正在尝试从 XML 文件中解析所有 action 节点,并为每个节点附加其后紧跟的 atbat 的 num 值。下面附有一段 XML 文件示例。我已经有可以抓取所有 action 的代码,只是想不出如何把后面那个 atbat 的 num 添加到列表/数据框中。我认为它是兄弟节点,而不是父/子节点,因为这些 action 并没有嵌套在 atbat 节点之下。我正在使用的代码如下。我曾尝试加入 following-sibling(选取节点名不等于 action 的兄弟节点),但只会返回错误。代码应当从每个 action 节点中提取所有属性值,附加 gameid(这一步已经实现),并附加其后 atbat 的 num 值。
# Inning-by-inning play data for a single game.
url <- "http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_20/gid_2015_05_20_tbamlb_atlmlb_1/inning/inning_all.xml"
mlb <- read_xml(url)

# get all actions ---------------------------------------------------------
# The variable keeps its original name (atbat) although it holds the
# <action> nodes. The unchecked try() wrapper was dropped so that a failed
# lookup stops here instead of failing later inside lapply().
atbat <- xml_find_all(mlb, "//action")

# One row per <action> node, one column per XML attribute; bind_rows() fills
# attributes that a node lacks with NA.
pitch_dat <- bind_rows(lapply(atbat, function(y) {
  data.frame(t(xml_attrs(y)), stringsAsFactors = FALSE)
}))

# The game id is the "gid_..." path segment of the URL (characters 66-95).
game_id <- substr(url, 66, 95)
pitch_dat$game_id <- game_id

# num of the first <atbat> sibling that follows each action. On a nodeset,
# xml_find_first() returns one (possibly missing) node per input node, so the
# result lines up with the rows of pitch_dat; actions with no following
# <atbat> sibling get NA.
pitch_dat$atbat_num <- xml_attr(
  xml_find_first(atbat, "following-sibling::atbat[1]"),
  "num"
)

batsdf <- data.frame(pitch_dat)
xml示例
<atbat num="44" b="4" s="0" o="2" start_tfs="004231" start_tfs_zulu="2015-05-21T00:42:31Z" batter="452678" stand="R" b_height="6-0" pitcher="501593" p_throws="L" des="Luis Avilan intentionally walks Asdrubal Cabrera. " des_es="Luis Avilan recibe base por bolas intencional Asdrubal Cabrera. " event_num="333" event="Intent Walk" event_es="Base por Bolas Intencional" play_guid="86cdbf1e-049e-4eb6-9314-85cfb3e6e28a" home_team_runs="1" away_team_runs="1">...</atbat>
<action b="4" s="0" o="2" des="Coaching visit to mound. " des_es="Visita del Instructor a la Lomita" event="Game Advisory" event_es="Aviso en el Juego" tfs="004848" tfs_zulu="2015-05-21T00:48:48Z" player="425784" pitch="4" event_num="334" home_team_runs="1" away_team_runs="1"/>
<action b="0" s="0" o="2" des="Pitching Change: Brandon Cunniff replaces Luis Avilan, batting 9th. " des_es="Cambio de Lanzador: Brandon Cunniff reemplaza a Luis Avilan, bateando noveno. " event="Pitching Substitution" event_es="Cambio de Lanzador" tfs="004904" tfs_zulu="2015-05-21T00:49:04Z" player="594792" pitch="4" event_num="336" home_team_runs="1" away_team_runs="1"/>
<action b="0" s="0" o="2" des="Offensive Substitution: Pinch-hitter Nick Franklin replaces Rene Rivera. " des_es="Sustitución a la ofensiva: bateador emergente Nick Franklin reemplaza a Rene Rivera. " event="Offensive Sub" event_es="Cambio Defensivo" tfs="004924" tfs_zulu="2015-05-21T00:49:24Z" player="545338" pitch="4" event_num="338" home_team_runs="1" away_team_runs="1"/>
<atbat num="45" b="0" s="1" o="3" start_tfs="004934" start_tfs_zulu="2015-05-21T00:49:34Z" batter="545338" stand="L" b_height="6-1" pitcher="594792" p_throws="R" des="Nick Franklin grounds out softly to first baseman Freddie Freeman. " des_es="Nick Franklin batea rodado de out suavemente a primera base Freddie Freeman. " event_num="343" event="Groundout" event_es="Roletazo de Out" home_team_runs="1" away_team_runs="1">...</atbat>
我希望最终得到如下结果,其中 atbat_num 字段为了演示而填入了虚构的示例数据。
Variables:
$ b (chr) "1", "0", "0", "4", "0", "0", "0", "0", "0", "0", "4", "1", "0", "0", "0", "0", "0"
$ s (chr) "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "2", "2", "0", "0", "0", "0", "0"
$ o (chr) "1", "2", "0", "2", "2", "2", "0", "0", "0", "1", "2", "2", "0", "0", "0", "0", "2"
$ des (chr) "Coaching visit to mound. ", "Offensive Substitution: Pinch-hitter Adonis Garcia replaces Williams Perez. ", "Pitchi...
$ des_es (chr) "Visita del Instructor a la Lomita", "Sustitución a la ofensiva: bateador emergente Adonis Garcia reemplaza a Williams...
$ event (chr) "Game Advisory", "Offensive Sub", "Pitching Substitution", "Game Advisory", "Pitching Substitution", "Offensive Sub", ...
$ event_es (chr) "Aviso en el Juego", "Cambio Defensivo", "Cambio de Lanzador", "Aviso en el Juego", "Cambio de Lanzador", "Cambio Defe...
$ tfs (chr) "001915", "002757", "003536", "004848", "004904", "004924", "005010", "005949", "010446", "011004", "011539", "011723"...
$ tfs_zulu (chr) "2015-05-21T00:19:15Z", "2015-05-21T00:27:57Z", "2015-05-21T00:35:36Z", "2015-05-21T00:48:48Z", "2015-05-21T00:49:04Z"...
$ player (chr) "519306", "611177", "501593", "425784", "594792", "545338", "435064", "542994", "544993", "465674", "430948", "607054"...
$ pitch (chr) "3", "3", "5", "4", "4", "4", "2", "4", "2", "2", "10", "3", "4", "4", "1", "6", "6"
$ event_num (chr) "253", "283", "306", "334", "336", "338", "347", "386", "404", "420", "442", "449", "456", "458", "490", "515", "532"
$ home_team_runs (chr) "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"
$ away_team_runs (chr) "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"
$ play_guid (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "c99d1e25-008f-4c46-aff0-d9e9e8548664", NA, NA, NA, NA, NA
$ game_id (chr) "gid_2015_05_20_tbamlb_atlmlb_1", "gid_2015_05_20_tbamlb_atlmlb_1", "gid_2015_05_20_tbamlb_atlmlb_1", "gid_2015_05_20_...
$ atbat_num (chr) "1","12","13","24","24","34","35...
非常感谢任何帮助!!谢谢!
答案 0 :(得分:0)
我想出了一种方法,但我相信一定还有更好的办法来做到这一点。
# Parse the inning_all.xml document for this game.
mlb <- read_xml(innallURL)

# Grab every <bottom> half-inning node.
tactions <- try(xml_find_all(mlb, "//bottom"), silent = TRUE)
# Skip ahead when the lookup failed or matched nothing.
# NOTE(review): `next` assumes this snippet runs inside a loop (presumably
# over game URLs) that is not shown here.
if (inherits(tactions, "try-error") || length(tactions) == 0) next

# make a giant data frame -------------------------------------------------
bats <- bind_rows(lapply(tactions, function(x) {
  # Grab every direct child node of this half-inning.
  children <- try(xml_find_all(x, "./*"), silent = FALSE)
  if (inherits(children, "try-error") || length(children) == 0) {
    return(NULL)
  }
  # One row per child node, one column per XML attribute.
  bind_rows(lapply(children, function(y) {
    data.frame(t(xml_attrs(y)), stringsAsFactors = FALSE)
  }))
}))

# Grab the game id from the URL and add it to every row.
game_id <- substr(innallURL, 66, 95)
bats$game_id <- game_id

# Create the data frame.
batsdf <- data.frame(bats)

# Add the rows from this pass to allbats (new rows go first).
allbats <- bind_rows(batsdf, allbats)
##################################################
# SEARCH TOP INNING
#
##################################################
# Grab every <top> half-inning node.
actions <- try(xml_find_all(mlb, "//top"), silent = TRUE)
# Skip ahead when the lookup failed or matched nothing.
# NOTE(review): `next` assumes this snippet runs inside a loop (presumably
# over game URLs) that is not shown here.
if (inherits(actions, "try-error") || length(actions) == 0) next

# make a giant data frame -------------------------------------------------
bats <- bind_rows(lapply(actions, function(x) {
  # Grab every direct child node of this half-inning.
  children <- try(xml_find_all(x, "./*"), silent = FALSE)
  if (inherits(children, "try-error") || length(children) == 0) {
    return(NULL)
  }
  # One row per child node, one column per XML attribute.
  bind_rows(lapply(children, function(y) {
    data.frame(t(xml_attrs(y)), stringsAsFactors = FALSE)
  }))
}))

# Grab the game id from the URL and add it to every row.
game_id <- substr(innallURL, 66, 95)
bats$game_id <- game_id

# Convert to a data frame and add the rows to allbats (new rows go first).
batsdf <- data.frame(bats)
allbats <- bind_rows(batsdf, allbats)
# Use na.locf() from the zoo package to copy each atbat num up from the next
# row that has one (fromLast = TRUE carries values backward). na.rm = FALSE
# keeps the result the same length as the column even when trailing rows
# have no later num to borrow; those rows stay NA instead of being dropped,
# which would make the assignment fail.
allbats$num <- na.locf(allbats$num, fromLast = TRUE, na.rm = FALSE)

# Keep only the rows whose start_tfs is NA, leaving just the actions.
bottomAllbats <- subset(allbats, is.na(start_tfs))
##################################################
# REMOVE COLUMNS WITH ALL NA's
#
##################################################
# Drop every column whose NA count equals the number of rows, i.e. columns
# that hold no data for the remaining action rows. drop = FALSE keeps the
# result a data frame even when only one column survives.
all_na <- colSums(is.na(bottomAllbats)) == nrow(bottomAllbats)
tactiondf <- bottomAllbats[, !all_na, drop = FALSE]