使用父节点(parent)和子节点(children)的属性值解析 XML

时间:2015-08-27 01:32:15

标签: xml r xml-parsing

我正在尝试解析以下 XML 文件,并把 atbat 的 num 属性值与 data.frame 中的每条投球(pitch)记录放在同一行。

Full MLB Gameday XML File

<atbat num="1" b="1" s="3" o="1" start_tfs="231201" start_tfs_zulu="2014-06-10T23:12:01Z" batter="571697" stand="L" b_height="5-10" pitcher="493137" p_throws="R" des="Scooter Gennett called out on strikes. " des_es="Scooter Gennett se poncha sin tirarle. " event="Strikeout">

<pitch des="Ball" des_es="Bola mala" id="3" type="B" tfs="231236" tfs_zulu="2014-06-10T23:12:36Z" x="148.50" y="111.39" sv_id="140610_191405" start_speed="88.9" end_speed="81.1" sz_top="3.49" sz_bot="1.77" pfx_x="-7.11" pfx_z="6.3" px="-1.447" pz="3.935" x0="-2.51" y0="50.0" z0="5.896" vx0="5.084" vy0="-130.343" vz0="-0.852" ax="-12.102" ay="29.981" az="-21.38" break_y="23.7" break_angle="26.9" break_length="5.8" pitch_type="FT" type_confidence="2.000" zone="11" nasty="35" spin_dir="228.269" spin_rate="1804.956" cc="" mt=""/>
<pitch des="Called Strike" des_es="Strike cantado" id="4" type="S" tfs="231250" tfs_zulu="2014-06-10T23:12:50Z" x="85.84" y="158.88" sv_id="140610_191419" start_speed="90.7" end_speed="83.4" sz_top="3.31" sz_bot="1.44" pfx_x="-6.26" pfx_z="7.63" px="0.402" pz="1.822" x0="-2.405" y0="50.0" z0="5.533" vx0="9.46" vy0="-132.494" vz0="-6.149" ax="-11.109" ay="28.773" az="-18.554" break_y="23.8" break_angle="24.7" break_length="5.0" pitch_type="FF" type_confidence=".874" zone="9" nasty="45" spin_dir="219.201" spin_rate="1924.531" cc="" mt=""/>
<pitch des="Called Strike" des_es="Strike cantado" id="5" type="S" tfs="231308" tfs_zulu="2014-06-10T23:13:08Z" x="105.58" y="166.65" sv_id="140610_191437" start_speed="80.4" end_speed="74.1" sz_top="3.2" sz_bot="1.41" pfx_x="9.8" pfx_z="2.15" px="-0.242" pz="1.644" x0="-2.525" y0="50.0" z0="5.977" vx0="2.346" vy0="-117.836" vz0="-3.748" ax="13.625" ay="24.687" az="-29.117" break_y="23.8" break_angle="-25.1" break_length="9.6" pitch_type="SL" type_confidence="2.000" zone="7" nasty="34" spin_dir="102.646" spin_rate="1719.198" cc="" mt=""/>
<pitch des="Foul" des_es="Foul" id="6" type="S" tfs="231325" tfs_zulu="2014-06-10T23:13:25Z" x="125.32" y="132.97" sv_id="140610_191454" start_speed="91.2" end_speed="83.4" sz_top="3.13" sz_bot="1.44" pfx_x="-4.45" pfx_z="7.42" px="-0.822" pz="2.988" x0="-2.524" y0="50.0" z0="5.617" vx0="5.993" vy0="-133.61" vz0="-3.337" ax="-7.988" ay="30.874" az="-18.794" break_y="23.7" break_angle="18.8" break_length="4.7" pitch_type="FF" type_confidence="2.000" zone="11" nasty="59" spin_dir="210.836" spin_rate="1692.064" cc="" mt=""/>
<pitch des="Called Strike" des_es="Strike cantado" id="7" type="S" tfs="231351" tfs_zulu="2014-06-10T23:13:51Z" x="123.61" y="161.47" sv_id="140610_191520" start_speed="89.7" end_speed="83.2" sz_top="3.27" sz_bot="1.52" pfx_x="0.14" pfx_z="7.56" px="-0.796" pz="1.706" x0="-2.612" y0="50.0" z0="5.657" vx0="4.67" vy0="-131.367" vz0="-6.647" ax="0.25" ay="26.547" az="-18.826" break_y="23.8" break_angle="-3.2" break_length="4.7" pitch_type="FC" type_confidence="2.000" zone="13" nasty="62" spin_dir="178.929" spin_rate="1474.376" cc="" mt=""/>

</atbat>

下面是我用来将 XML 解析为 data.frame 的 R 脚本,它运行正常。我已经注释掉了 atbat num 那一行,因为加上它只会导致行数不匹配的错误。

library(XML)
library(dplyr)
library(plyr)
library(RMySQL)

require(XML)

# Gameday locations: the site root, plus the game_events.xml and
# inning/inning_all.xml documents of game gid_2014_06_10_milmlb_nynmlb_1.
baseURL <- "http://gd2.mlb.com/components/game/"
testURL <- "http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_10/gid_2014_06_10_milmlb_nynmlb_1/game_events.xml"
inningallURL <- "http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_10/gid_2014_06_10_milmlb_nynmlb_1/inning/inning_all.xml"

# Parse both documents as internal-node trees so that XPath queries
# (xpathSApply) can be run against them.
data <- xmlTreeParse(
  file = testURL,
  useInternalNodes = TRUE
)
ia_data <- xmlTreeParse(
  file = inningallURL,
  useInternalNodes = TRUE
)


# Build a data frame with one row per <pitch> element of the inning_all
# document: a constant game_id column followed by one column per pitch
# attribute, in the order listed below.
inningall_df <- local({

  # Attributes read from every <pitch> node; each becomes a column of the
  # same name.
  pitch_attr_names <- c(
    "des", "des_es", "id", "type", "tfs", "tfs_zulu", "x", "y", "sv_id",
    "start_speed", "end_speed", "sz_top", "sz_bot", "pfx_x", "pfx_z",
    "px", "pz", "x0", "y0", "z0", "vx0", "vy0", "vz0", "ax", "ay", "az",
    "break_y", "break_angle", "break_length", "pitch_type",
    "type_confidence", "zone", "nasty", "spin_dir", "spin_rate", "cc", "mt"
  )

  # One XPath pass per attribute; the list names carry over as column names.
  pitch_columns <- lapply(
    setNames(pitch_attr_names, pitch_attr_names),
    function(attr_name) {
      xpathSApply(ia_data, "//*/pitch", xmlGetAttr, attr_name)
    }
  )

  # The at-bat number column is left disabled: the query below returns one
  # value per <atbat> node, not one per <pitch>, so its length does not match
  # the other columns.
  #   num = xpathSApply(ia_data, '//*/atbat', xmlGetAttr, 'num')

  # Characters 66-95 of the URL hold the game id directory name.
  leading_column <- list(game_id = substr(inningallURL, 66, 95))

  do.call(data.frame, c(leading_column, pitch_columns))
})

我搜索并找到了几个示例,其中一个(见此处)看起来应该可行,但我折腾的结果只是把原本能正常生成 data.frame 的代码弄坏了。哈!我最终想得到的是这样一个数据框:每个打席(at-bat)的编号在其下属的每个子节点(pitch)记录上重复出现。为了不伤你的眼睛,我省略了大部分列。

                       game_id    atbatnum                   des                        des_es  id type
gid_2014_06_10_milmlb_nynmlb_1       1                      Ball                     Bola mala   3    B
gid_2014_06_10_milmlb_nynmlb_1       1             Called Strike                Strike cantado   4    S
gid_2014_06_10_milmlb_nynmlb_1       1             Called Strike                Strike cantado   5    S
gid_2014_06_10_milmlb_nynmlb_1       1                      Foul                          Foul   6    S
gid_2014_06_10_milmlb_nynmlb_1       1             Called Strike                Strike cantado   7    S
gid_2014_06_10_milmlb_nynmlb_1       2             Called Strike                Strike cantado  11    S
gid_2014_06_10_milmlb_nynmlb_1       2                      Ball                     Bola mala  12    B
gid_2014_06_10_milmlb_nynmlb_1       2                      Ball                     Bola mala  13    B
gid_2014_06_10_milmlb_nynmlb_1       2                      Ball                     Bola mala  14    B
gid_2014_06_10_milmlb_nynmlb_1       2                      Ball                     Bola mala  15    B

1 个答案:

答案 0 :(得分:1)

下面的代码使用 xml2 而不是 XML 包,并生成一个“大”数据框,其中包含每个打席(at-bat)的所有投球:

library(xml2)
library(dplyr)

# URLs --------------------------------------------------------------------

# Site root plus the game_events.xml and inning/inning_all.xml documents of
# game gid_2014_06_10_milmlb_nynmlb_1.
baseURL <- "http://gd2.mlb.com/components/game/"
testURL <- "http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_10/gid_2014_06_10_milmlb_nynmlb_1/game_events.xml"
inningallURL <- "http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_10/gid_2014_06_10_milmlb_nynmlb_1/inning/inning_all.xml"

# Get your file -----------------------------------------------------------

# Keep a local copy named after the last URL component and only download
# when that file is not already present in the working directory.
fil <- basename(inningallURL)
if (!file.exists(fil)) {
  download.file(url = inningallURL, destfile = fil)
}

mlb <- read_xml(x = fil)

# get all at-bats ---------------------------------------------------------

# Node set of every <atbat> element anywhere in the document.
atbat <- xml_find_all(mlb, xpath = "//atbat")

# make a giant data frame -------------------------------------------------

# Convert one <atbat> node into a data frame with one row per child <pitch>.
#
# @param atbat_node A single <atbat> node (an element of `atbat`).
# @return A data frame whose columns are the pitch attributes (character)
#   followed by `atbat_num`, the `num` attribute of the at-bat itself; or
#   NULL when the at-bat has no <pitch> children or a lookup fails, so that
#   bind_rows() simply skips it.
atbat_pitches <- function(atbat_node) {

  # Best effort: try() reports a failing lookup and the at-bat is skipped.
  pitches <- try(xml_find_all(atbat_node, "./pitch"), silent = FALSE)
  if (inherits(pitches, "try-error") || length(pitches) == 0) {
    return(NULL)
  }

  # Read the parent's number before building the rows, so nothing is built
  # for an at-bat that ends up being skipped.
  atbat_num <- try(xml_attr(atbat_node, "num"))
  if (inherits(atbat_num, "try-error") || length(atbat_num) == 0) {
    return(NULL)
  }

  # One single-row data frame per pitch; bind_rows() fills attributes that a
  # given pitch lacks with NA.
  pitch_dat <- bind_rows(lapply(pitches, function(pitch_node) {
    data.frame(t(xml_attrs(pitch_node)), stringsAsFactors = FALSE)
  }))

  # Added last so that atbat_num follows the attributes of this at-bat's
  # pitches in the column order.
  pitch_dat$atbat_num <- atbat_num

  pitch_dat
}

bats <- bind_rows(lapply(atbat, atbat_pitches))

# Add game id -------------------------------------------------------------

# The game id is the "gid_..." directory component of the feed URL. It is
# matched by pattern instead of by fixed character offsets, so the result
# stays correct if the part of the URL in front of it changes length.
game_id <- regmatches(inningallURL, regexpr("gid_[^/]+", inningallURL))

# Fail loudly rather than tag every row with a wrong or missing id.
stopifnot(length(game_id) == 1L)

bats$game_id <- game_id

# take a look -------------------------------------------------------------

glimpse(bats)

## Observations: 271
## Variables:
## $ des             (chr) "Ball", "Called Strike", "Called Strike", "Foul", "Called ...
## $ des_es          (chr) "Bola mala", "Strike cantado", "Strike cantado", "Foul", "...
## $ id              (chr) "3", "4", "5", "6", "7", "11", "12", "13", "14", "15", "19...
## $ type            (chr) "B", "S", "S", "S", "S", "S", "B", "B", "B", "B", "S", "X"...
## $ tfs             (chr) "231236", "231250", "231308", "231325", "231351", "231427"...
## $ tfs_zulu        (chr) "2014-06-10T23:12:36Z", "2014-06-10T23:12:50Z", "2014-06-1...
## $ x               (chr) "148.50", "85.84", "105.58", "125.32", "123.61", "84.98", ...
## $ y               (chr) "111.39", "158.88", "166.65", "132.97", "161.47", "159.74"...
## $ sv_id           (chr) "140610_191405", "140610_191419", "140610_191437", "140610...
## $ start_speed     (chr) "88.9", "90.7", "80.4", "91.2", "89.7", "92.2", "90.5", "8...
## $ end_speed       (chr) "81.1", "83.4", "74.1", "83.4", "83.2", "83.5", "83.4", "7...
## $ sz_top          (chr) "3.49", "3.31", "3.2", "3.13", "3.27", "3.37", "3.63", "3....
## $ sz_bot          (chr) "1.77", "1.44", "1.41", "1.44", "1.52", "1.65", "1.7", "1....
## $ pfx_x           (chr) "-7.11", "-6.26", "9.8", "-4.45", "0.14", "-9.74", "-1.19"...
## $ pfx_z           (chr) "6.3", "7.63", "2.15", "7.42", "7.56", "7.86", "5.74", "-2...
## $ px              (chr) "-1.447", "0.402", "-0.242", "-0.822", "-0.796", "0.454", ...
## $ pz              (chr) "3.935", "1.822", "1.644", "2.988", "1.706", "1.881", "2.2...
## $ x0              (chr) "-2.51", "-2.405", "-2.525", "-2.524", "-2.612", "-2.221",...
## $ y0              (chr) "50.0", "50.0", "50.0", "50.0", "50.0", "50.0", "50.0", "5...
## $ z0              (chr) "5.896", "5.533", "5.977", "5.617", "5.657", "5.528", "5.6...
## $ vx0             (chr) "5.084", "9.46", "2.346", "5.993", "4.67", "10.388", "4.62...
## $ vy0             (chr) "-130.343", "-132.494", "-117.836", "-133.61", "-131.367",...
## $ vz0             (chr) "-0.852", "-6.149", "-3.748", "-3.337", "-6.647", "-6.23",...
## $ ax              (chr) "-12.102", "-11.109", "13.625", "-7.988", "0.25", "-17.572...
## $ ay              (chr) "29.981", "28.773", "24.687", "30.874", "26.547", "34.419"...
## $ az              (chr) "-21.38", "-18.554", "-29.117", "-18.794", "-18.826", "-17...
## $ break_y         (chr) "23.7", "23.8", "23.8", "23.7", "23.8", "23.7", "23.8", "2...
## $ break_angle     (chr) "26.9", "24.7", "-25.1", "18.8", "-3.2", "37.6", "2.9", "-...
## $ break_length    (chr) "5.8", "5.0", "9.6", "4.7", "4.7", "5.6", "5.2", "11.2", "...
## $ pitch_type      (chr) "FT", "FF", "SL", "FF", "FC", "FT", "FF", "SL", "FF", "FF"...
## $ type_confidence (chr) "2.000", ".874", "2.000", "2.000", "2.000", "2.000", "2.00...
## $ zone            (chr) "11", "9", "7", "11", "13", "9", "13", "11", "11", "14", "...
## $ nasty           (chr) "35", "45", "34", "59", "62", "56", "58", "60", "64", "25"...
## $ spin_dir        (chr) "228.269", "219.201", "102.646", "210.836", "178.929", "23...
## $ spin_rate       (chr) "1804.956", "1924.531", "1719.198", "1692.064", "1474.376"...
## $ cc              (chr) "", "", "", "", "", "", "", "", "", "Daisuke Matsuzaka is ...
## $ mt              (chr) "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""...
## $ atbat_num       (chr) "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "3", "3"...
## $ on_1b           (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "460075", "460075"...
## $ on_2b           (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "4...
## $ on_3b           (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ game_id         (chr) "gid_2014_06_10_milmlb_nynmlb_1", "gid_2014_06_10_milmlb_n...