R xml:获取特定节点之后紧随的兄弟节点的属性值

时间:2015-09-04 15:22:24

标签: r xml-parsing libxml2 lapply

我正在尝试从 XML 文件中解析所有 action 节点,并为每个节点附加紧随其后的 atbat 节点的 num 值。下面附有该 XML 文件的一个片段。我已经有可以抓取所有 action 节点的代码,只是不知道如何把其后的 atbat num 加入列表/数据框。我认为 atbat 是 action 的兄弟节点,而不是父/子节点,因为这些 action 并没有嵌套在某个 atbat 节点之下。我目前使用的代码如下。我曾尝试在 XPath 中加入 following-sibling(并排除 action 节点),但只得到错误。代码应当从每个 action 节点中提取所有属性值,附加 game_id(这一步已经实现),并附加其后的 atbat num 值。

url <- "http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_20/gid_2015_05_20_tbamlb_atlmlb_1/inning/inning_all.xml"

# Download and parse the inning-by-inning XML feed for this game.
mlb <- read_xml(url)

# Select every <action> node anywhere in the document ----------------------

atbat <- try(xml_find_all(mlb, "//action"), silent = FALSE)

# Turn each node's attributes into a one-row data frame and stack the rows.
pitch_dat <- bind_rows(
  lapply(atbat, function(node) {
    data.frame(t(xml_attrs(node)), stringsAsFactors = FALSE)
  })
)

# Characters 66-95 of the URL hold the game id ("gid_..._1" for this URL).
game_id <- substr(url, 66, 95)
pitch_dat$game_id <- game_id

batsdf <- data.frame(pitch_dat)

xml示例

<atbat num="44" b="4" s="0" o="2" start_tfs="004231" start_tfs_zulu="2015-05-21T00:42:31Z" batter="452678" stand="R" b_height="6-0" pitcher="501593" p_throws="L" des="Luis Avilan intentionally walks Asdrubal Cabrera. " des_es="Luis Avilan recibe base por bolas intencional Asdrubal Cabrera. " event_num="333" event="Intent Walk" event_es="Base por Bolas Intencional" play_guid="86cdbf1e-049e-4eb6-9314-85cfb3e6e28a" home_team_runs="1" away_team_runs="1">...</atbat>
<action b="4" s="0" o="2" des="Coaching visit to mound. " des_es="Visita del Instructor a la Lomita" event="Game Advisory" event_es="Aviso en el Juego" tfs="004848" tfs_zulu="2015-05-21T00:48:48Z" player="425784" pitch="4" event_num="334" home_team_runs="1" away_team_runs="1"/>
<action b="0" s="0" o="2" des="Pitching Change: Brandon Cunniff replaces Luis Avilan, batting 9th. " des_es="Cambio de Lanzador: Brandon Cunniff reemplaza a Luis Avilan, bateando noveno. " event="Pitching Substitution" event_es="Cambio de Lanzador" tfs="004904" tfs_zulu="2015-05-21T00:49:04Z" player="594792" pitch="4" event_num="336" home_team_runs="1" away_team_runs="1"/>
<action b="0" s="0" o="2" des="Offensive Substitution: Pinch-hitter Nick Franklin replaces Rene Rivera. " des_es="Sustitución a la ofensiva: bateador emergente Nick Franklin reemplaza a Rene Rivera. " event="Offensive Sub" event_es="Cambio Defensivo" tfs="004924" tfs_zulu="2015-05-21T00:49:24Z" player="545338" pitch="4" event_num="338" home_team_runs="1" away_team_runs="1"/>
<atbat num="45" b="0" s="1" o="3" start_tfs="004934" start_tfs_zulu="2015-05-21T00:49:34Z" batter="545338" stand="L" b_height="6-1" pitcher="594792" p_throws="R" des="Nick Franklin grounds out softly to first baseman Freddie Freeman. " des_es="Nick Franklin batea rodado de out suavemente a primera base Freddie Freeman. " event_num="343" event="Groundout" event_es="Roletazo de Out" home_team_runs="1" away_team_runs="1">...</atbat>

Full XML

我希望最终得到如下结果;其中 atbat_num 字段的值是为了举例而手工填写的。

Variables:
$ b              (chr) "1", "0", "0", "4", "0", "0", "0", "0", "0", "0", "4", "1", "0", "0", "0", "0", "0"
$ s              (chr) "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "2", "2", "0", "0", "0", "0", "0"
$ o              (chr) "1", "2", "0", "2", "2", "2", "0", "0", "0", "1", "2", "2", "0", "0", "0", "0", "2"
$ des            (chr) "Coaching visit to mound.  ", "Offensive Substitution: Pinch-hitter Adonis Garcia replaces Williams Perez.  ", "Pitchi...
$ des_es         (chr) "Visita del Instructor a la Lomita", "Sustitución a la ofensiva: bateador emergente Adonis Garcia reemplaza a Williams...
$ event          (chr) "Game Advisory", "Offensive Sub", "Pitching Substitution", "Game Advisory", "Pitching Substitution", "Offensive Sub", ...
$ event_es       (chr) "Aviso en el Juego", "Cambio Defensivo", "Cambio de Lanzador", "Aviso en el Juego", "Cambio de Lanzador", "Cambio Defe...
$ tfs            (chr) "001915", "002757", "003536", "004848", "004904", "004924", "005010", "005949", "010446", "011004", "011539", "011723"...
$ tfs_zulu       (chr) "2015-05-21T00:19:15Z", "2015-05-21T00:27:57Z", "2015-05-21T00:35:36Z", "2015-05-21T00:48:48Z", "2015-05-21T00:49:04Z"...
$ player         (chr) "519306", "611177", "501593", "425784", "594792", "545338", "435064", "542994", "544993", "465674", "430948", "607054"...
$ pitch          (chr) "3", "3", "5", "4", "4", "4", "2", "4", "2", "2", "10", "3", "4", "4", "1", "6", "6"
$ event_num      (chr) "253", "283", "306", "334", "336", "338", "347", "386", "404", "420", "442", "449", "456", "458", "490", "515", "532"
$ home_team_runs (chr) "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", "2", "2", "2"
$ away_team_runs (chr) "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"
$ play_guid      (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "c99d1e25-008f-4c46-aff0-d9e9e8548664", NA, NA, NA, NA, NA
$ game_id        (chr) "gid_2015_05_20_tbamlb_atlmlb_1", "gid_2015_05_20_tbamlb_atlmlb_1", "gid_2015_05_20_tbamlb_atlmlb_1", "gid_2015_05_20_...
$ atbat_num      (chr)  "1","12","13","24","24","34","35...

非常感谢任何帮助!!谢谢!

1 个答案:

答案 0 :(得分:0)

我想出了一种方法,不过我相信还有更好的做法。

# Download and parse the inning XML for the current game.
mlb <- read_xml(innallURL)

# Grab the bottom half of every inning.
tactions <- try(xml_find_all(mlb, "//bottom"), silent = TRUE)

# Skip this game when the lookup failed or matched nothing.
# NOTE(review): `next` assumes this snippet runs inside a loop that is not
# shown here -- confirm against the enclosing code.
if (inherits(tactions, "try-error") || length(tactions) == 0) next

# make a giant data frame -------------------------------------------------

# Stack the attributes of every child of every bottom half-inning into one
# data frame: one row per child node, one column per XML attribute.
bats <- bind_rows(lapply(tactions, function(x) {

  # Grab everything directly under this half-inning.
  pitches <- try(xml_find_all(x, "./*"), silent = FALSE)

  # A half-inning with no usable children contributes no rows.
  if (inherits(pitches, "try-error") || length(pitches) == 0) return(NULL)

  # bind_rows() fills attributes that a node lacks with NA.
  bind_rows(lapply(pitches, function(y) {
    data.frame(t(xml_attrs(y)), stringsAsFactors = FALSE)
  }))

}))

# Grab the game id from the URL (characters 66-95) and add it to every row.
game_id <- substr(innallURL, 66, 95)
bats$game_id <- game_id

# Create the data frame.
batsdf <- data.frame(bats)

# Add this pass's rows to the running table.
# NOTE(review): `allbats` must already exist before this line runs; its
# initialisation is not shown in this snippet.
allbats <- bind_rows(batsdf, allbats)

##################################################
#         SEARCH TOP INNING
#
##################################################

# Grab the top half of every inning.
actions <- try(xml_find_all(mlb, "//top"), silent = TRUE)

# Skip this game when the lookup failed or matched nothing.
# NOTE(review): `next` assumes this snippet runs inside a loop that is not
# shown here -- confirm against the enclosing code.
if (inherits(actions, "try-error") || length(actions) == 0) next

# make a giant data frame -------------------------------------------------

# Stack the attributes of every child of every top half-inning into one
# data frame: one row per child node, one column per XML attribute.
bats <- bind_rows(lapply(actions, function(x) {

  # Grab everything directly under this half-inning.
  pitches <- try(xml_find_all(x, "./*"), silent = FALSE)

  # A half-inning with no usable children contributes no rows.
  if (inherits(pitches, "try-error") || length(pitches) == 0) return(NULL)

  # bind_rows() fills attributes that a node lacks with NA.
  bind_rows(lapply(pitches, function(y) {
    data.frame(t(xml_attrs(y)), stringsAsFactors = FALSE)
  }))

}))

# Grab the game id from the URL (characters 66-95) and add it to every row.
game_id <- substr(innallURL, 66, 95)
bats$game_id <- game_id

# Convert to a data frame and add this pass's rows to the running table.
# NOTE(review): `allbats` must already exist before this line runs; its
# initialisation is not shown in this snippet.
batsdf <- data.frame(bats)
allbats <- bind_rows(batsdf, allbats)

# Use the zoo library to pull the at-bat number up from the next row that has
# one, so each row without a `num` receives the `num` of the row that follows.
# na.rm = FALSE keeps trailing NAs in place: with the default (TRUE) zoo drops
# them, the result is shorter than the column, and the assignment fails.
# NOTE(review): the fill runs across the whole stacked table, so a row with no
# later at-bat in its own half-inning takes the number from whatever rows
# follow it in `allbats` -- confirm that is acceptable.
allbats$num <- na.locf(allbats$num, fromLast = TRUE, na.rm = FALSE)

# Drop every row whose start_tfs is not NA. In the sample XML only <atbat>
# nodes carry start_tfs, so this leaves just the action rows.
bottomAllbats <- subset(allbats, is.na(start_tfs))


##################################################
#         REMOVE COLUMNS WITH ALL NA's
#
##################################################

# Drop every column left over from other node types: a column is removed when
# its NA count equals the number of rows, so only columns that hold data for
# the remaining action rows survive. drop = FALSE keeps the result a data
# frame even if a single column is left.
tactiondf <- bottomAllbats[
  , colSums(is.na(bottomAllbats)) != nrow(bottomAllbats), drop = FALSE
]