有效地重新称量数据集后如何循环功能

时间:2019-05-22 16:37:55

标签: r function loops dataframe purrr

很抱歉,可能不清楚的地方。

我一直在尝试运行来自ballerker软件包(https://rdrr.io/github/BillPetti/baseballr/src/R/run_expectancy_code.R)的运行期望函数的循环,该循环为我拥有的10,091个唯一音高中的每一个计算159个可能状态下的运行值。但是我希望函数不根据孤立的音调来计算,而是根据所有音调组合后不断重新加权的数据帧进行计算。因此,对于任何给定的唯一音高,我都希望复制与该音高有关的行,以使其占最终数据帧的20%,然后我希望将音高(排名第一)的音高(并将此数据存储在一个单独的数据帧中,称为音高相似度)中-5在相似度中占另外20%,在#6-10中占10%,在11-30中占5%,其余音高则占50%。这仍然是实验性的,但是我想从这里开始。

我认为在设置权重方面取得了一些进展,但我不认为我一直在复制行,而且我最努力的是使此过程成为一个恒定的循环,以便每个唯一的音高在重新加权主数据帧后,可以计算其运行期望值。我也很难将所有的运行期望数据都显示在一个数据帧中–我不确定是否可以为该数据计算函数,然后重新称重,然后再次计算,依此类推,直到为所有螺距循环为止,所以我得到一个大数据框,最重要的是第三列,该列列出了用作混合主要音调的唯一音高,因此我可以区分。请注意,lefty_abs是主要数据帧,所有行均用于计算运行期望值,pitch_similarity是我的数据帧,其中所有音高最相似,相似度最高。为了方便起见,我将所有内容的dput样本的粘贴框放在最后。我还添加了一个音调数据框,其中仅列出了唯一的音调。

这是运行预期功能,以及我的重新加权尝试

library(dplyr)

run_expectancy_code <- function(df, level = "plate appearance") {

  single_outs <- c("strikeout", "caught_stealing_2b",
                   "pickoff_caught_stealing_2b", "other_out",
                   "caught_stealing_3b", "caught_stealing_home",
                   "field_out", "force_out", "pickoff_1b",
                   "batter_interference", "fielders_choice",
                   "pickoff_2b", "pickoff_caught_stealing_3b",
                   "pickoff_caught_stealing_home")
  df <- df %>%
    dplyr::arrange(game_pk, at_bat_number, pitch_number) %>%
    dplyr::group_by(game_pk) %>%
    dplyr::mutate(final_pitch_game =
                    ifelse(pitch_number == max(pitch_number), 1, 0)) %>%
    dplyr::ungroup() %>%
    dplyr::group_by(game_pk, at_bat_number, inning_topbot) %>%
    dplyr::mutate(final_pitch_at_bat = ifelse(pitch_number == max(pitch_number), 1, 0)) %>%
    dplyr::ungroup()

  df <- df %>%
    dplyr::arrange(game_pk, inning_topbot, at_bat_number, pitch_number) %>%
    dplyr::mutate(runs_scored_on_pitch = stringr::str_count(des, "scores"),
                  runs_scored_on_pitch =
                    ifelse(events == "home_run", runs_scored_on_pitch + 1,
                           runs_scored_on_pitch),
                  bat_score_after = bat_score + runs_scored_on_pitch) %>%
    dplyr::arrange(game_pk, at_bat_number, pitch_number) %>%
    dplyr::mutate(final_pitch_inning =
                    ifelse(final_pitch_at_bat == 1 &
                             inning_topbot != lead(inning_topbot), 1, 0),
                  final_pitch_inning = ifelse(is.na(final_pitch_inning),
                                              1, final_pitch_inning))

  if (level == "plate appearance") {
    df <- df %>%
      dplyr::group_by(game_pk, inning, inning_topbot) %>%
      dplyr::mutate(bat_score_start_inning = min(bat_score),
                    bat_score_end_inning = max(bat_score),
                    cum_runs_in_inning = cumsum(runs_scored_on_pitch),
                    runs_to_end_inning = bat_score_end_inning - bat_score) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(base_out_state = paste(outs_when_up, " outs, ",
                                           ifelse(!is.na(.$on_1b), "1b", "_"),
                                           ifelse(!is.na(.$on_2b), "2b", "_"),
                                           ifelse(!is.na(.$on_3b), "3b", "_")))

    re_table <- run_expectancy_table(df)

    df <- df %>%
      left_join(re_table, by = "base_out_state")

    df <- df %>%
      dplyr::filter(final_pitch_at_bat == 1) %>%
      dplyr::arrange(game_pk, inning, inning_topbot) %>%
      dplyr::group_by(game_pk, inning, inning_topbot) %>%
      dplyr::mutate(next_base_out_state = dplyr::lead(base_out_state)) %>%
      dplyr::ungroup() %>%
      dplyr::left_join(re_table,
                       by = c("next_base_out_state" = "base_out_state")) %>%
      dplyr::rename(next_avg_re = avg_re.y,
                    avg_re = avg_re.x) %>%
      dplyr::mutate(next_avg_re = ifelse(is.na(next_avg_re), 0, next_avg_re),
                    change_re = next_avg_re - avg_re,
                    re24 = change_re + runs_scored_on_pitch) %>%
      dplyr::arrange(game_pk, inning, inning_topbot)
  } else {
    df <- df %>%
      dplyr::group_by(game_pk, inning, inning_topbot) %>%
      dplyr::mutate(bat_score_start_inning = min(bat_score),
                    bat_score_end_inning = max(bat_score),
                    cum_runs_in_inning = cumsum(runs_scored_on_pitch),
                    runs_to_end_inning = bat_score_end_inning - bat_score) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(count_base_out_state =
                      paste(balls, "-", strikes, ", ",
                            outs_when_up, " outs, ",
                            ifelse(!is.na(.$on_1b), "1b", "_"),
                            ifelse(!is.na(.$on_2b), "2b", "_"),
                            ifelse(!is.na(.$on_3b), "3b", "_")))

    re_table <- run_expectancy_table(df, level = "pitch")

    df <- df %>%
      left_join(re_table, by = "count_base_out_state")

    df <- df %>%
      #dplyr::filter(final_pitch_at_bat == 1) %>%
      dplyr::arrange(game_pk, inning, inning_topbot) %>%
      dplyr::group_by(game_pk, inning, inning_topbot) %>%
      dplyr::mutate(next_count_base_out_state =
                      dplyr::lead(count_base_out_state)) %>%
      dplyr::ungroup() %>%
      dplyr::left_join(re_table,
                       by = c("next_count_base_out_state" =
                                "count_base_out_state")) %>%
      dplyr::rename(next_avg_re = avg_re.y,
                    avg_re = avg_re.x) %>%
      dplyr::mutate(next_avg_re = ifelse(is.na(next_avg_re), 0, next_avg_re),
                    change_re = next_avg_re - avg_re,
                    re24 = change_re + runs_scored_on_pitch) %>%
      dplyr::arrange(game_pk, inning, inning_topbot)
  }

  assign("run_expectancy_state_table", re_table, envir = .GlobalEnv)

  df

}


run_expectancy_table <- function(df, level = "plate appearance") {

  if (level == "plate appearance") {

    df <- df %>%
      dplyr::filter(final_pitch_at_bat == 1, inning < 9) %>%
      dplyr::group_by(base_out_state) %>%
      dplyr::summarise(avg_re = mean(runs_to_end_inning, na.rm = TRUE)) %>%
      dplyr::arrange(desc(avg_re))
  } else {
    df <- df %>%
      dplyr::filter(inning < 9) %>%
      dplyr::group_by(count_base_out_state) %>%
      dplyr::summarise(avg_re = mean(runs_to_end_inning, na.rm = TRUE)) %>%
      dplyr::arrange(desc(avg_re))
  }
  df
}




# reweighting below

{twenty<-nrow(lefty_abs)*0.20

samedf<-lefty_abs[lefty_abs$pitch_key%in%pitch_similarity$pitch_1,]

if (twenty<=nrow(samedf)) {
  twentydf<-samedf[1:twenty,]
} else {
  twentydf<-matrix(nrow = twenty,ncol = ncol(lefty_abs))
twentydf<-lefty_abs[1:twenty,]
number<-floor(twenty/nrow(samedf))

a<-1
b<-nrow(samedf)
c<-b

for (i in 1:number) {

  twentydf[a:c,]<-samedf

  a<-a+b
  c<-c+b
} 

twentydf[a:twenty,]<-samedf[1:(twenty-a+1),]

}

onefive<-pitch_similarity[pitch_similarity$rank>=1 & pitch_similarity$rank<=5,]
same2df<-lefty_abs[lefty_abs$pitch_key%in%onefive$pitch_2,]

if (twenty<=nrow(same2df)) {
  twenty2df<-same2df[1:twenty,]
} else {
  twenty2df<-lefty_abs[1:twenty,]
  number2<-floor(twenty/nrow(same2df))

d<-1
e<-nrow(same2df)  
f<-e

  for (i in 1:number2) {

    twenty2df[d:f,]<-same2df

    d<-d+e
    f<-f+e
  } 

  twenty2df[d:twenty,]<-same2df[1:(twenty-d+1),]

}

sixten<-pitch_similarity[pitch_similarity$rank>=6 & pitch_similarity$rank<=10,]
same3df<-lefty_abs[lefty_abs$pitch_key%in%sixten$pitch_2,]

if (twenty/4<=nrow(same3df)) {
  twenty3df<-same3df[1:(twenty/4),]
} else {

  twenty3df<-lefty_abs[1:(twenty/4),]
  number3<-floor(twenty/4/nrow(same3df))

  g<-1
  h<-nrow(same3df)  
  j<-h

  for (i in 1:number3) {

    twenty3df[g:j,]<-same3df

    g<-g+h
    j<-j+h
  } 

  twenty3df[g:(twenty/4),]<-same3df[1:(twenty/4-g+1),]
  twenty3df<-twenty3df[1:(twenty/4),]

}

eleven<-pitch_similarity[pitch_similarity$rank>=11 & pitch_similarity$rank<=30,]
same4df<-lefty_abs[lefty_abs$pitch_key%in%eleven$pitch_2,]

if (twenty/4<=nrow(same4df)) {
  twenty4df<-same4df[1:twenty/4,]
} else {

  twenty4df<-lefty_abs[1:(twenty/4),]
  number4<-floor(twenty/4/nrow(same4df))

  k<-1
  l<-nrow(same4df)  
  m<-l

  for (i in 1:number4) {

    twenty4df[k:m,]<-same4df

    k<-k+l
    m<-m+l
  } 

  twenty4df[k:(twenty/4),]<-same4df[1:(twenty/4-k+1),]
  twenty4df<-twenty4df[1:(twenty/4),]

}


   newdf<-rbind.data.frame(twentydf,twenty2df,twenty3df,twenty4df)

    rest<-lefty_abs[!lefty_abs$pitch_key %in% newdf$pitch_key,]
    rest<-rest[1:(nrow(lefty_abs)/2),]
  final<-rbind.data.frame(newdf,rest)

  run_expectancy_code(final, level = "pitch")  
  }



采样采样

structure(list(sample_pitches = structure(c(9L, 8L, 10L, 7L, 
1L, 2L, 4L, 6L, 3L, 5L), .Label = c("150037-FF", "218596-FF", 
"218596-FS", "218596-SI", "346800-FC", "346800-FF", "349193-FT", 
"400010-FF", "493247-SI", "493247-SL"), class = "factor")), row.names = c(NA, 
-10L), class = "data.frame")

音高相似度采样头

> head(sample_similarity)
    pitch_1   pitch_2 euclid_dist rank batter_handedness
1 493247-SI 434958-SI   0.4343468    4                 L
2 493247-SI 503285-SI   0.6632168   13                 L
3 493247-SI 448592-SI   0.7847100   20                 L
4 493247-SI 434958-FF   0.7659369   17                 L
5 493247-SI 501925-SI   0.2812055    1                 L

样本左撇子的头部,因为即使dput也会很大-

> head(sample_lefty)
    pitch_type  game_date release_speed release_pos_x release_pos_z  player_name batter
626         SI 2008-03-30          91.9            NA            NA Peter Moylan 150217
627         SI 2008-03-30          91.0            NA            NA Peter Moylan 150217
628         SI 2008-03-30          91.6            NA            NA Peter Moylan 150217
629         SI 2008-03-30          89.6            NA            NA Peter Moylan 150217
631         FF 2008-03-30          92.3            NA            NA    Jon Rauch 435263
    pitcher    events   description spin_dir spin_rate_deprecated break_angle_deprecated
626  493247 strikeout      foul_tip     <NA>                 <NA>                   <NA>
627  493247      null          foul     <NA>                 <NA>                   <NA>
628  493247      null called_strike     <NA>                 <NA>                   <NA>
629  493247      null          ball     <NA>                 <NA>                   <NA>
631  400010 field_out hit_into_play     <NA>                 <NA>                   <NA>
    break_length_deprecated zone                                                   des
626                    <NA>    7            Cristian Guzman strikes out on a foul tip.
627                    <NA>    8                                                  null
628                    <NA>    5                                                  null
629                    <NA>   13                                                  null
631                    <NA>    4 Brian McCann flies out to left fielder Willie Harris.
    game_type stand p_throws home_team away_team type hit_location  bb_type balls
626         R     L        R       WSH       ATL    S            2     null     2
627         R     L        R       WSH       ATL    S         null     null     2
628         R     L        R       WSH       ATL    S         null     null     2
629         R     L        R       WSH       ATL    B         null     null     1
631         R     L        R       WSH       ATL    X            7 fly_ball     3
    strikes game_year      pfx_x        pfx_z plate_x plate_z on_3b on_2b on_1b
626       2      2008 -1.1290775 -0.004106667  -0.402   1.789    NA    NA    NA
627       1      2008 -1.0738283  0.049930000  -0.014   1.658    NA    NA    NA
628       0      2008 -0.9675050 -0.107020000   0.203   2.070    NA    NA    NA
629       0      2008 -1.2086808  0.115863333  -0.724   1.017    NA    NA    NA
631       0      2008 -0.2232417  2.557116667  -0.742   2.725    NA    NA    NA
    outs_when_up inning inning_topbot  hc_x   hc_y tfs_deprecated tfs_zulu_deprecated
626            0      9           Bot    NA     NA           <NA>                <NA>
627            0      9           Bot    NA     NA           <NA>                <NA>
628            0      9           Bot    NA     NA           <NA>                <NA>
629            0      9           Bot    NA     NA           <NA>                <NA>
631            2      9           Top 62.25 103.41           <NA>                <NA>
    fielder_2 umpire         sv_id    vx0      vy0    vz0      ax     ay      az sz_top
626      null   <NA> 080330_224253 11.150 -133.360 -0.139 -12.709 33.238 -34.621  3.227
627      null   <NA> 080330_224229 11.203 -132.113 -0.270 -11.476 39.266 -33.872  3.227
628      null   <NA> 080330_224213 11.329 -132.914  1.120 -10.408 37.292 -35.805  3.227
629      null   <NA> 080330_224155  9.866 -130.138 -1.565 -12.779 37.120 -33.067  3.227
631      null   <NA> 080330_223858  0.224 -134.052 -9.971  -1.178 38.019  -3.179  3.644
    sz_bot hit_distance_sc launch_speed launch_angle effective_speed release_spin_rate
626  1.508              NA           NA           NA              NA                NA
627  1.508              NA           NA           NA              NA                NA
628  1.508              NA           NA           NA              NA                NA
629  1.508              NA           NA           NA              NA                NA
631  1.724              NA           NA           NA              NA                NA
    release_extension game_pk pitcher_1 fielder_2_1 fielder_3 fielder_4 fielder_5
626                NA  233759    493247        null      null      null      null
627                NA  233759    493247        null      null      null      null
628                NA  233759    493247        null      null      null      null
629                NA  233759    493247        null      null      null      null
631                NA  233759    400010        null      null      null      null
    fielder_6 fielder_7 fielder_8 fielder_9 release_pos_y estimated_ba_using_speedangle
626      null      null      null      null            NA                            NA
627      null      null      null      null            NA                            NA
628      null      null      null      null            NA                            NA
629      null      null      null      null            NA                            NA
631      null      null      null      null            NA                            NA
    estimated_woba_using_speedangle woba_value woba_denom babip_value iso_value
626                              NA          0         NA           0         0
627                              NA         NA         NA          NA        NA
628                              NA         NA         NA          NA        NA
629                              NA         NA         NA          NA        NA
631                              NA          0         NA           0         0
    launch_speed_angle at_bat_number pitch_number      pitch_name home_score away_score
626                 NA            62            5          Sinker          2          2
627                 NA            62            4          Sinker          2          2
628                 NA            62            3          Sinker          2          2
629                 NA            62            2          Sinker          2          2
631                 NA            61            4 4-Seam Fastball          2          2
    bat_score fld_score post_away_score post_home_score post_bat_score post_fld_score
626         2         2               2               2              2              2
627         2         2               2               2              2              2
628         2         2               2               2              2              2
629         2         2               2               2              2              2
631         2         2               2               2              2              2
    if_fielding_alignment of_fielding_alignment barrel pitch_key
626                  null                  null     NA 493247-SI
627                  null                  null     NA 493247-SI
628                  null                  null     NA 493247-SI
629                  null                  null     NA 493247-SI
631                  null                  null     NA 400010-FF
>

0 个答案:

没有答案