很抱歉,可能不清楚的地方。
我一直在尝试运行来自ballerker软件包(https://rdrr.io/github/BillPetti/baseballr/src/R/run_expectancy_code.R)的运行期望函数的循环,该循环为我拥有的10,091个唯一音高中的每一个计算159个可能状态下的运行值。但是我希望函数不根据孤立的音调来计算,而是根据所有音调组合后不断重新加权的数据帧进行计算。因此,对于任何给定的唯一音高,我都希望复制与该音高有关的行,以使其占最终数据帧的20%,然后我希望将音高(排名第一)的音高(并将此数据存储在一个单独的数据帧中,称为音高相似度)中-5在相似度中占另外20%,在#6-10中占10%,在11-30中占5%,其余音高则占50%。这仍然是实验性的,但是我想从这里开始。
我认为在设置权重方面取得了一些进展,但我不认为我一直在复制行,而且我最努力的是使此过程成为一个恒定的循环,以便每个唯一的音高在重新加权主数据帧后,可以计算其运行期望值。我也很难将所有的运行期望数据都显示在一个数据帧中–我不确定是否可以为该数据计算函数,然后重新称重,然后再次计算,依此类推,直到为所有螺距循环为止,所以我得到一个大数据框,最重要的是第三列,该列列出了用作混合主要音调的唯一音高,因此我可以区分。请注意,lefty_abs是主要数据帧,所有行均用于计算运行期望值,pitch_similarity是我的数据帧,其中所有音高最相似,相似度最高。为了方便起见,我将所有内容的dput样本的粘贴框放在最后。我还添加了一个音调数据框,其中仅列出了唯一的音调。
这是运行预期功能,以及我的重新加权尝试
library(dplyr)
run_expectancy_code <- function(df, level = "plate appearance") {
single_outs <- c("strikeout", "caught_stealing_2b",
"pickoff_caught_stealing_2b", "other_out",
"caught_stealing_3b", "caught_stealing_home",
"field_out", "force_out", "pickoff_1b",
"batter_interference", "fielders_choice",
"pickoff_2b", "pickoff_caught_stealing_3b",
"pickoff_caught_stealing_home")
df <- df %>%
dplyr::arrange(game_pk, at_bat_number, pitch_number) %>%
dplyr::group_by(game_pk) %>%
dplyr::mutate(final_pitch_game =
ifelse(pitch_number == max(pitch_number), 1, 0)) %>%
dplyr::ungroup() %>%
dplyr::group_by(game_pk, at_bat_number, inning_topbot) %>%
dplyr::mutate(final_pitch_at_bat = ifelse(pitch_number == max(pitch_number), 1, 0)) %>%
dplyr::ungroup()
df <- df %>%
dplyr::arrange(game_pk, inning_topbot, at_bat_number, pitch_number) %>%
dplyr::mutate(runs_scored_on_pitch = stringr::str_count(des, "scores"),
runs_scored_on_pitch =
ifelse(events == "home_run", runs_scored_on_pitch + 1,
runs_scored_on_pitch),
bat_score_after = bat_score + runs_scored_on_pitch) %>%
dplyr::arrange(game_pk, at_bat_number, pitch_number) %>%
dplyr::mutate(final_pitch_inning =
ifelse(final_pitch_at_bat == 1 &
inning_topbot != lead(inning_topbot), 1, 0),
final_pitch_inning = ifelse(is.na(final_pitch_inning),
1, final_pitch_inning))
if (level == "plate appearance") {
df <- df %>%
dplyr::group_by(game_pk, inning, inning_topbot) %>%
dplyr::mutate(bat_score_start_inning = min(bat_score),
bat_score_end_inning = max(bat_score),
cum_runs_in_inning = cumsum(runs_scored_on_pitch),
runs_to_end_inning = bat_score_end_inning - bat_score) %>%
dplyr::ungroup() %>%
dplyr::mutate(base_out_state = paste(outs_when_up, " outs, ",
ifelse(!is.na(.$on_1b), "1b", "_"),
ifelse(!is.na(.$on_2b), "2b", "_"),
ifelse(!is.na(.$on_3b), "3b", "_")))
re_table <- run_expectancy_table(df)
df <- df %>%
left_join(re_table, by = "base_out_state")
df <- df %>%
dplyr::filter(final_pitch_at_bat == 1) %>%
dplyr::arrange(game_pk, inning, inning_topbot) %>%
dplyr::group_by(game_pk, inning, inning_topbot) %>%
dplyr::mutate(next_base_out_state = dplyr::lead(base_out_state)) %>%
dplyr::ungroup() %>%
dplyr::left_join(re_table,
by = c("next_base_out_state" = "base_out_state")) %>%
dplyr::rename(next_avg_re = avg_re.y,
avg_re = avg_re.x) %>%
dplyr::mutate(next_avg_re = ifelse(is.na(next_avg_re), 0, next_avg_re),
change_re = next_avg_re - avg_re,
re24 = change_re + runs_scored_on_pitch) %>%
dplyr::arrange(game_pk, inning, inning_topbot)
} else {
df <- df %>%
dplyr::group_by(game_pk, inning, inning_topbot) %>%
dplyr::mutate(bat_score_start_inning = min(bat_score),
bat_score_end_inning = max(bat_score),
cum_runs_in_inning = cumsum(runs_scored_on_pitch),
runs_to_end_inning = bat_score_end_inning - bat_score) %>%
dplyr::ungroup() %>%
dplyr::mutate(count_base_out_state =
paste(balls, "-", strikes, ", ",
outs_when_up, " outs, ",
ifelse(!is.na(.$on_1b), "1b", "_"),
ifelse(!is.na(.$on_2b), "2b", "_"),
ifelse(!is.na(.$on_3b), "3b", "_")))
re_table <- run_expectancy_table(df, level = "pitch")
df <- df %>%
left_join(re_table, by = "count_base_out_state")
df <- df %>%
#dplyr::filter(final_pitch_at_bat == 1) %>%
dplyr::arrange(game_pk, inning, inning_topbot) %>%
dplyr::group_by(game_pk, inning, inning_topbot) %>%
dplyr::mutate(next_count_base_out_state =
dplyr::lead(count_base_out_state)) %>%
dplyr::ungroup() %>%
dplyr::left_join(re_table,
by = c("next_count_base_out_state" =
"count_base_out_state")) %>%
dplyr::rename(next_avg_re = avg_re.y,
avg_re = avg_re.x) %>%
dplyr::mutate(next_avg_re = ifelse(is.na(next_avg_re), 0, next_avg_re),
change_re = next_avg_re - avg_re,
re24 = change_re + runs_scored_on_pitch) %>%
dplyr::arrange(game_pk, inning, inning_topbot)
}
assign("run_expectancy_state_table", re_table, envir = .GlobalEnv)
df
}
run_expectancy_table <- function(df, level = "plate appearance") {
if (level == "plate appearance") {
df <- df %>%
dplyr::filter(final_pitch_at_bat == 1, inning < 9) %>%
dplyr::group_by(base_out_state) %>%
dplyr::summarise(avg_re = mean(runs_to_end_inning, na.rm = TRUE)) %>%
dplyr::arrange(desc(avg_re))
} else {
df <- df %>%
dplyr::filter(inning < 9) %>%
dplyr::group_by(count_base_out_state) %>%
dplyr::summarise(avg_re = mean(runs_to_end_inning, na.rm = TRUE)) %>%
dplyr::arrange(desc(avg_re))
}
df
}
# reweighting below
{twenty<-nrow(lefty_abs)*0.20
samedf<-lefty_abs[lefty_abs$pitch_key%in%pitch_similarity$pitch_1,]
if (twenty<=nrow(samedf)) {
twentydf<-samedf[1:twenty,]
} else {
twentydf<-matrix(nrow = twenty,ncol = ncol(lefty_abs))
twentydf<-lefty_abs[1:twenty,]
number<-floor(twenty/nrow(samedf))
a<-1
b<-nrow(samedf)
c<-b
for (i in 1:number) {
twentydf[a:c,]<-samedf
a<-a+b
c<-c+b
}
twentydf[a:twenty,]<-samedf[1:(twenty-a+1),]
}
onefive<-pitch_similarity[pitch_similarity$rank>=1 & pitch_similarity$rank<=5,]
same2df<-lefty_abs[lefty_abs$pitch_key%in%onefive$pitch_2,]
if (twenty<=nrow(same2df)) {
twenty2df<-same2df[1:twenty,]
} else {
twenty2df<-lefty_abs[1:twenty,]
number2<-floor(twenty/nrow(same2df))
d<-1
e<-nrow(same2df)
f<-e
for (i in 1:number2) {
twenty2df[d:f,]<-same2df
d<-d+e
f<-f+e
}
twenty2df[d:twenty,]<-same2df[1:(twenty-d+1),]
}
sixten<-pitch_similarity[pitch_similarity$rank>=6 & pitch_similarity$rank<=10,]
same3df<-lefty_abs[lefty_abs$pitch_key%in%sixten$pitch_2,]
if (twenty/4<=nrow(same3df)) {
twenty3df<-same3df[1:(twenty/4),]
} else {
twenty3df<-lefty_abs[1:(twenty/4),]
number3<-floor(twenty/4/nrow(same3df))
g<-1
h<-nrow(same3df)
j<-h
for (i in 1:number3) {
twenty3df[g:j,]<-same3df
g<-g+h
j<-j+h
}
twenty3df[g:(twenty/4),]<-same3df[1:(twenty/4-g+1),]
twenty3df<-twenty3df[1:(twenty/4),]
}
eleven<-pitch_similarity[pitch_similarity$rank>=11 & pitch_similarity$rank<=30,]
same4df<-lefty_abs[lefty_abs$pitch_key%in%eleven$pitch_2,]
if (twenty/4<=nrow(same4df)) {
twenty4df<-same4df[1:twenty/4,]
} else {
twenty4df<-lefty_abs[1:(twenty/4),]
number4<-floor(twenty/4/nrow(same4df))
k<-1
l<-nrow(same4df)
m<-l
for (i in 1:number4) {
twenty4df[k:m,]<-same4df
k<-k+l
m<-m+l
}
twenty4df[k:(twenty/4),]<-same4df[1:(twenty/4-k+1),]
twenty4df<-twenty4df[1:(twenty/4),]
}
newdf<-rbind.data.frame(twentydf,twenty2df,twenty3df,twenty4df)
rest<-lefty_abs[!lefty_abs$pitch_key %in% newdf$pitch_key,]
rest<-rest[1:(nrow(lefty_abs)/2),]
final<-rbind.data.frame(newdf,rest)
run_expectancy_code(final, level = "pitch")
}
采样采样
structure(list(sample_pitches = structure(c(9L, 8L, 10L, 7L,
1L, 2L, 4L, 6L, 3L, 5L), .Label = c("150037-FF", "218596-FF",
"218596-FS", "218596-SI", "346800-FC", "346800-FF", "349193-FT",
"400010-FF", "493247-SI", "493247-SL"), class = "factor")), row.names = c(NA,
-10L), class = "data.frame")
音高相似度采样头
> head(sample_similarity)
pitch_1 pitch_2 euclid_dist rank batter_handedness
1 493247-SI 434958-SI 0.4343468 4 L
2 493247-SI 503285-SI 0.6632168 13 L
3 493247-SI 448592-SI 0.7847100 20 L
4 493247-SI 434958-FF 0.7659369 17 L
5 493247-SI 501925-SI 0.2812055 1 L
样本左撇子的头部,因为即使dput也会很大-
> head(sample_lefty)
pitch_type game_date release_speed release_pos_x release_pos_z player_name batter
626 SI 2008-03-30 91.9 NA NA Peter Moylan 150217
627 SI 2008-03-30 91.0 NA NA Peter Moylan 150217
628 SI 2008-03-30 91.6 NA NA Peter Moylan 150217
629 SI 2008-03-30 89.6 NA NA Peter Moylan 150217
631 FF 2008-03-30 92.3 NA NA Jon Rauch 435263
pitcher events description spin_dir spin_rate_deprecated break_angle_deprecated
626 493247 strikeout foul_tip <NA> <NA> <NA>
627 493247 null foul <NA> <NA> <NA>
628 493247 null called_strike <NA> <NA> <NA>
629 493247 null ball <NA> <NA> <NA>
631 400010 field_out hit_into_play <NA> <NA> <NA>
break_length_deprecated zone des
626 <NA> 7 Cristian Guzman strikes out on a foul tip.
627 <NA> 8 null
628 <NA> 5 null
629 <NA> 13 null
631 <NA> 4 Brian McCann flies out to left fielder Willie Harris.
game_type stand p_throws home_team away_team type hit_location bb_type balls
626 R L R WSH ATL S 2 null 2
627 R L R WSH ATL S null null 2
628 R L R WSH ATL S null null 2
629 R L R WSH ATL B null null 1
631 R L R WSH ATL X 7 fly_ball 3
strikes game_year pfx_x pfx_z plate_x plate_z on_3b on_2b on_1b
626 2 2008 -1.1290775 -0.004106667 -0.402 1.789 NA NA NA
627 1 2008 -1.0738283 0.049930000 -0.014 1.658 NA NA NA
628 0 2008 -0.9675050 -0.107020000 0.203 2.070 NA NA NA
629 0 2008 -1.2086808 0.115863333 -0.724 1.017 NA NA NA
631 0 2008 -0.2232417 2.557116667 -0.742 2.725 NA NA NA
outs_when_up inning inning_topbot hc_x hc_y tfs_deprecated tfs_zulu_deprecated
626 0 9 Bot NA NA <NA> <NA>
627 0 9 Bot NA NA <NA> <NA>
628 0 9 Bot NA NA <NA> <NA>
629 0 9 Bot NA NA <NA> <NA>
631 2 9 Top 62.25 103.41 <NA> <NA>
fielder_2 umpire sv_id vx0 vy0 vz0 ax ay az sz_top
626 null <NA> 080330_224253 11.150 -133.360 -0.139 -12.709 33.238 -34.621 3.227
627 null <NA> 080330_224229 11.203 -132.113 -0.270 -11.476 39.266 -33.872 3.227
628 null <NA> 080330_224213 11.329 -132.914 1.120 -10.408 37.292 -35.805 3.227
629 null <NA> 080330_224155 9.866 -130.138 -1.565 -12.779 37.120 -33.067 3.227
631 null <NA> 080330_223858 0.224 -134.052 -9.971 -1.178 38.019 -3.179 3.644
sz_bot hit_distance_sc launch_speed launch_angle effective_speed release_spin_rate
626 1.508 NA NA NA NA NA
627 1.508 NA NA NA NA NA
628 1.508 NA NA NA NA NA
629 1.508 NA NA NA NA NA
631 1.724 NA NA NA NA NA
release_extension game_pk pitcher_1 fielder_2_1 fielder_3 fielder_4 fielder_5
626 NA 233759 493247 null null null null
627 NA 233759 493247 null null null null
628 NA 233759 493247 null null null null
629 NA 233759 493247 null null null null
631 NA 233759 400010 null null null null
fielder_6 fielder_7 fielder_8 fielder_9 release_pos_y estimated_ba_using_speedangle
626 null null null null NA NA
627 null null null null NA NA
628 null null null null NA NA
629 null null null null NA NA
631 null null null null NA NA
estimated_woba_using_speedangle woba_value woba_denom babip_value iso_value
626 NA 0 NA 0 0
627 NA NA NA NA NA
628 NA NA NA NA NA
629 NA NA NA NA NA
631 NA 0 NA 0 0
launch_speed_angle at_bat_number pitch_number pitch_name home_score away_score
626 NA 62 5 Sinker 2 2
627 NA 62 4 Sinker 2 2
628 NA 62 3 Sinker 2 2
629 NA 62 2 Sinker 2 2
631 NA 61 4 4-Seam Fastball 2 2
bat_score fld_score post_away_score post_home_score post_bat_score post_fld_score
626 2 2 2 2 2 2
627 2 2 2 2 2 2
628 2 2 2 2 2 2
629 2 2 2 2 2 2
631 2 2 2 2 2 2
if_fielding_alignment of_fielding_alignment barrel pitch_key
626 null null NA 493247-SI
627 null null NA 493247-SI
628 null null NA 493247-SI
629 null null NA 493247-SI
631 null null NA 400010-FF
>