我正在尝试解析下面的 XML 文件,并在 data.frame 中把 atbat 的 num 属性值附加到每一条投球(pitch)记录上。
<atbat num="1" b="1" s="3" o="1" start_tfs="231201" start_tfs_zulu="2014-06-10T23:12:01Z" batter="571697" stand="L" b_height="5-10" pitcher="493137" p_throws="R" des="Scooter Gennett called out on strikes. " des_es="Scooter Gennett se poncha sin tirarle. " event="Strikeout">
<pitch des="Ball" des_es="Bola mala" id="3" type="B" tfs="231236" tfs_zulu="2014-06-10T23:12:36Z" x="148.50" y="111.39" sv_id="140610_191405" start_speed="88.9" end_speed="81.1" sz_top="3.49" sz_bot="1.77" pfx_x="-7.11" pfx_z="6.3" px="-1.447" pz="3.935" x0="-2.51" y0="50.0" z0="5.896" vx0="5.084" vy0="-130.343" vz0="-0.852" ax="-12.102" ay="29.981" az="-21.38" break_y="23.7" break_angle="26.9" break_length="5.8" pitch_type="FT" type_confidence="2.000" zone="11" nasty="35" spin_dir="228.269" spin_rate="1804.956" cc="" mt=""/>
<pitch des="Called Strike" des_es="Strike cantado" id="4" type="S" tfs="231250" tfs_zulu="2014-06-10T23:12:50Z" x="85.84" y="158.88" sv_id="140610_191419" start_speed="90.7" end_speed="83.4" sz_top="3.31" sz_bot="1.44" pfx_x="-6.26" pfx_z="7.63" px="0.402" pz="1.822" x0="-2.405" y0="50.0" z0="5.533" vx0="9.46" vy0="-132.494" vz0="-6.149" ax="-11.109" ay="28.773" az="-18.554" break_y="23.8" break_angle="24.7" break_length="5.0" pitch_type="FF" type_confidence=".874" zone="9" nasty="45" spin_dir="219.201" spin_rate="1924.531" cc="" mt=""/>
<pitch des="Called Strike" des_es="Strike cantado" id="5" type="S" tfs="231308" tfs_zulu="2014-06-10T23:13:08Z" x="105.58" y="166.65" sv_id="140610_191437" start_speed="80.4" end_speed="74.1" sz_top="3.2" sz_bot="1.41" pfx_x="9.8" pfx_z="2.15" px="-0.242" pz="1.644" x0="-2.525" y0="50.0" z0="5.977" vx0="2.346" vy0="-117.836" vz0="-3.748" ax="13.625" ay="24.687" az="-29.117" break_y="23.8" break_angle="-25.1" break_length="9.6" pitch_type="SL" type_confidence="2.000" zone="7" nasty="34" spin_dir="102.646" spin_rate="1719.198" cc="" mt=""/>
<pitch des="Foul" des_es="Foul" id="6" type="S" tfs="231325" tfs_zulu="2014-06-10T23:13:25Z" x="125.32" y="132.97" sv_id="140610_191454" start_speed="91.2" end_speed="83.4" sz_top="3.13" sz_bot="1.44" pfx_x="-4.45" pfx_z="7.42" px="-0.822" pz="2.988" x0="-2.524" y0="50.0" z0="5.617" vx0="5.993" vy0="-133.61" vz0="-3.337" ax="-7.988" ay="30.874" az="-18.794" break_y="23.7" break_angle="18.8" break_length="4.7" pitch_type="FF" type_confidence="2.000" zone="11" nasty="59" spin_dir="210.836" spin_rate="1692.064" cc="" mt=""/>
<pitch des="Called Strike" des_es="Strike cantado" id="7" type="S" tfs="231351" tfs_zulu="2014-06-10T23:13:51Z" x="123.61" y="161.47" sv_id="140610_191520" start_speed="89.7" end_speed="83.2" sz_top="3.27" sz_bot="1.52" pfx_x="0.14" pfx_z="7.56" px="-0.796" pz="1.706" x0="-2.612" y0="50.0" z0="5.657" vx0="4.67" vy0="-131.367" vz0="-6.647" ax="0.25" ay="26.547" az="-18.826" break_y="23.8" break_angle="-3.2" break_length="4.7" pitch_type="FC" type_confidence="2.000" zone="13" nasty="62" spin_dir="178.929" spin_rate="1474.376" cc="" mt=""/>
</atbat>
下面是我用来将 XML 解析为 data.frame 的 R 脚本,它可以正常运行。我把 atbat num 那一行注释掉了,因为加上它只会导致行数不匹配的错误。
library(XML)
library(dplyr)
library(plyr)
library(RMySQL)
require(XML)
# Source locations for the game being parsed.
baseURL <- 'http://gd2.mlb.com/components/game/'
testURL <- 'http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_10/gid_2014_06_10_milmlb_nynmlb_1/game_events.xml'
inningallURL <- 'http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_10/gid_2014_06_10_milmlb_nynmlb_1/inning/inning_all.xml'

# Parse both documents into internal (XPath-queryable) trees.
data <- xmlTreeParse(testURL, useInternalNodes = TRUE)
ia_data <- xmlTreeParse(inningallURL, useInternalNodes = TRUE)

# Build one column per <pitch> attribute, in the order listed below, and
# prepend the game id cut out of the URL by fixed character positions.
# The at-bat lookup stays disabled, exactly as in the original script:
#   num = xpathSApply(ia_data, '//*/atbat', xmlGetAttr, 'num')
# It is reported to fail with a row-count mismatch; '//*/atbat' matches
# at-bat nodes, whereas every other column is built from '//*/pitch'.
inningall_df <- do.call(
  data.frame,
  c(
    list(game_id = substr(inningallURL, 66, 95)),
    sapply(
      c(
        'des', 'des_es', 'id', 'type', 'tfs', 'tfs_zulu', 'x', 'y', 'sv_id',
        'start_speed', 'end_speed', 'sz_top', 'sz_bot', 'pfx_x', 'pfx_z',
        'px', 'pz', 'x0', 'y0', 'z0', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
        'break_y', 'break_angle', 'break_length', 'pitch_type',
        'type_confidence', 'zone', 'nasty', 'spin_dir', 'spin_rate',
        'cc', 'mt'
      ),
      function(attr_name) {
        xpathSApply(ia_data, '//*/pitch', xmlGetAttr, attr_name)
      },
      simplify = FALSE,
      USE.NAMES = TRUE
    )
  )
)
我搜索并找到了几个示例,其中一个(here)看起来应该可行,但我唯一做成的事就是把原本能正常生成 data.frame 的代码弄坏了。哈!我最终想得到的是一个数据框,其中 atbat 的 num 值会在它下面的每个子节点(pitch)记录上重复出现。为了不伤您的眼睛,我省略了大部分列。
game_id atbatnum des des_es id type
gid_2014_06_10_milmlb_nynmlb_1 1 Ball Bola mala 3 B
gid_2014_06_10_milmlb_nynmlb_1 1 Called Strike Strike cantado 4 S
gid_2014_06_10_milmlb_nynmlb_1 1 Called Strike Strike cantado 5 S
gid_2014_06_10_milmlb_nynmlb_1 1 Foul Foul 6 S
gid_2014_06_10_milmlb_nynmlb_1 1 Called Strike Strike cantado 7 S
gid_2014_06_10_milmlb_nynmlb_1 2 Called Strike Strike cantado 11 S
gid_2014_06_10_milmlb_nynmlb_1 2 Ball Bola mala 12 B
gid_2014_06_10_milmlb_nynmlb_1 2 Ball Bola mala 13 B
gid_2014_06_10_milmlb_nynmlb_1 2 Ball Bola mala 14 B
gid_2014_06_10_milmlb_nynmlb_1 2 Ball Bola mala 15 B
答案 0 :(得分:1)
下面的代码使用 xml2 而不是 XML,并生成一个“大”数据框,其中包含每个打席(at-bat)的所有投球:
library(xml2)
library(dplyr)
# URLs --------------------------------------------------------------------
baseURL <- 'http://gd2.mlb.com/components/game/'
testURL <- 'http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_10/gid_2014_06_10_milmlb_nynmlb_1/game_events.xml'
inningallURL <- 'http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_10/gid_2014_06_10_milmlb_nynmlb_1/inning/inning_all.xml'

# Get your file -----------------------------------------------------------
# Download only when no local copy exists, so re-runs reuse the cached file.
fil <- basename(inningallURL)
if (!file.exists(fil)) {
  download.file(inningallURL, fil)
}
mlb <- read_xml(fil)

# get all at-bats ---------------------------------------------------------
atbat <- xml_find_all(mlb, "//atbat")

# make a giant data frame -------------------------------------------------
# One row per <pitch>; each row also carries the `num` attribute of the
# <atbat> node it was found under. At-bats without pitches contribute nothing.
bats <- bind_rows(lapply(atbat, function(x) {
  pitches <- try(xml_find_all(x, "./pitch"), silent = FALSE)
  # `||` is the scalar, short-circuiting operator intended for `if` tests.
  if (inherits(pitches, "try-error") || length(pitches) == 0) {
    return(NULL)
  }

  # Each pitch's attributes become a one-row data frame; bind_rows() fills
  # attributes that a given pitch lacks with NA.
  pitch_dat <- bind_rows(lapply(pitches, function(y) {
    data.frame(t(xml_attrs(y)), stringsAsFactors = FALSE)
  }))

  atbat_num <- try(xml_attr(x, "num"))
  if (inherits(atbat_num, "try-error") || length(atbat_num) == 0) {
    return(NULL)
  }

  # A single value is recycled over every pitch row of this at-bat.
  pitch_dat$atbat_num <- atbat_num
  pitch_dat
}))

# Add game id -------------------------------------------------------------
# Take the game id from the URL path instead of fixed character offsets.
# Assumes the layout ".../<game id>/inning/inning_all.xml".
game_id <- basename(dirname(dirname(inningallURL)))
bats$game_id <- game_id

# take a look -------------------------------------------------------------
glimpse(bats)
## Observations: 271
## Variables:
## $ des (chr) "Ball", "Called Strike", "Called Strike", "Foul", "Called ...
## $ des_es (chr) "Bola mala", "Strike cantado", "Strike cantado", "Foul", "...
## $ id (chr) "3", "4", "5", "6", "7", "11", "12", "13", "14", "15", "19...
## $ type (chr) "B", "S", "S", "S", "S", "S", "B", "B", "B", "B", "S", "X"...
## $ tfs (chr) "231236", "231250", "231308", "231325", "231351", "231427"...
## $ tfs_zulu (chr) "2014-06-10T23:12:36Z", "2014-06-10T23:12:50Z", "2014-06-1...
## $ x (chr) "148.50", "85.84", "105.58", "125.32", "123.61", "84.98", ...
## $ y (chr) "111.39", "158.88", "166.65", "132.97", "161.47", "159.74"...
## $ sv_id (chr) "140610_191405", "140610_191419", "140610_191437", "140610...
## $ start_speed (chr) "88.9", "90.7", "80.4", "91.2", "89.7", "92.2", "90.5", "8...
## $ end_speed (chr) "81.1", "83.4", "74.1", "83.4", "83.2", "83.5", "83.4", "7...
## $ sz_top (chr) "3.49", "3.31", "3.2", "3.13", "3.27", "3.37", "3.63", "3....
## $ sz_bot (chr) "1.77", "1.44", "1.41", "1.44", "1.52", "1.65", "1.7", "1....
## $ pfx_x (chr) "-7.11", "-6.26", "9.8", "-4.45", "0.14", "-9.74", "-1.19"...
## $ pfx_z (chr) "6.3", "7.63", "2.15", "7.42", "7.56", "7.86", "5.74", "-2...
## $ px (chr) "-1.447", "0.402", "-0.242", "-0.822", "-0.796", "0.454", ...
## $ pz (chr) "3.935", "1.822", "1.644", "2.988", "1.706", "1.881", "2.2...
## $ x0 (chr) "-2.51", "-2.405", "-2.525", "-2.524", "-2.612", "-2.221",...
## $ y0 (chr) "50.0", "50.0", "50.0", "50.0", "50.0", "50.0", "50.0", "5...
## $ z0 (chr) "5.896", "5.533", "5.977", "5.617", "5.657", "5.528", "5.6...
## $ vx0 (chr) "5.084", "9.46", "2.346", "5.993", "4.67", "10.388", "4.62...
## $ vy0 (chr) "-130.343", "-132.494", "-117.836", "-133.61", "-131.367",...
## $ vz0 (chr) "-0.852", "-6.149", "-3.748", "-3.337", "-6.647", "-6.23",...
## $ ax (chr) "-12.102", "-11.109", "13.625", "-7.988", "0.25", "-17.572...
## $ ay (chr) "29.981", "28.773", "24.687", "30.874", "26.547", "34.419"...
## $ az (chr) "-21.38", "-18.554", "-29.117", "-18.794", "-18.826", "-17...
## $ break_y (chr) "23.7", "23.8", "23.8", "23.7", "23.8", "23.7", "23.8", "2...
## $ break_angle (chr) "26.9", "24.7", "-25.1", "18.8", "-3.2", "37.6", "2.9", "-...
## $ break_length (chr) "5.8", "5.0", "9.6", "4.7", "4.7", "5.6", "5.2", "11.2", "...
## $ pitch_type (chr) "FT", "FF", "SL", "FF", "FC", "FT", "FF", "SL", "FF", "FF"...
## $ type_confidence (chr) "2.000", ".874", "2.000", "2.000", "2.000", "2.000", "2.00...
## $ zone (chr) "11", "9", "7", "11", "13", "9", "13", "11", "11", "14", "...
## $ nasty (chr) "35", "45", "34", "59", "62", "56", "58", "60", "64", "25"...
## $ spin_dir (chr) "228.269", "219.201", "102.646", "210.836", "178.929", "23...
## $ spin_rate (chr) "1804.956", "1924.531", "1719.198", "1692.064", "1474.376"...
## $ cc (chr) "", "", "", "", "", "", "", "", "", "Daisuke Matsuzaka is ...
## $ mt (chr) "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""...
## $ atbat_num (chr) "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "3", "3"...
## $ on_1b (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "460075", "460075"...
## $ on_2b (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "4...
## $ on_3b (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ game_id (chr) "gid_2014_06_10_milmlb_nynmlb_1", "gid_2014_06_10_milmlb_n...