我正在做一些研究,需要我在几个单独的df
中进行分析。单独的df
的结果将在一个主要计算df
中使用。大多数统计信息类别都在多个df
中使用。我希望使用来自单独的df
# Table used for league average calculations below
teams <- tibble::tribble(
~Team, ~PA, ~AB, ~H, ~HBP, ~BB, ~HR, ~R, ~RBI, ~SB, ~AVG, ~OBP,
"Athletics", 6255, 5579, 1407, 76, 550, 227, 813, 778, 35, 0.252, 0.325,
"Red Sox", 6302, 5623, 1509, 55, 569, 208, 876, 829, 125, 0.268, 0.339,
"Yankees", 6271, 5515, 1374, 62, 625, 267, 851, 821, 63, 0.249, 0.329,
"Indians", 6300, 5595, 1447, 80, 554, 216, 818, 786, 135, 0.259, 0.332,
"Astros", 6146, 5453, 1390, 61, 565, 205, 797, 763, 71, 0.255, 0.329
)
# Table used for player calculations (main table)
players <- tibble::tribble(
~Name, ~Team, ~G, ~PA, ~AB, ~H, ~HBP, ~BB, ~HR, ~R, ~RBI, ~SB, ~AVG, ~OBP,
"Mookie Betts", "Red Sox", 136, 614, 520, 180, 8, 81, 32, 129, 80, 30, 0.346, 0.438,
"Mike Trout", "Angels", 140, 608, 471, 147, 10, 122, 39, 101, 79, 24, 0.312, 0.46,
"J.D. Martinez", "Red Sox", 150, 649, 569, 188, 4, 69, 43, 111, 130, 6, 0.33, 0.402,
"Alex Bregman", "Astros", 157, 705, 594, 170, 12, 96, 31, 105, 103, 10, 0.286, 0.394,
"Jose Ramirez", "Indians", 157, 698, 578, 156, 8, 106, 39, 110, 105, 34, 0.27, 0.387
)
# Denominators needed for calculations
calc_tbl <- tibble::tribble(
~data_col, ~calc_denom,
"HR", 14.3,
"R", 19.6,
"RBI", 17.5,
"SB", 26.2,
"AVG", 0.0045,
"OBP", 0.0031
) %>%
spread(key = data_col, value = "calc_denom")
# Get league average of teams
lg_avg <- teams %>%
# Divide counting stats by 10 to get the averages for 10 batters
mutate_at(vars(PA:SB), funs(./10)) %>%
summarize_if(is.numeric, mean, na.rm=TRUE)
lg_avg
#> # A tibble: 1 x 11
#> PA AB H HBP BB HR R RBI SB AVG OBP
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 625. 555. 143. 6.68 57.3 22.5 83.1 79.5 8.58 0.257 0.331
# Calculate Values
value_tbl <- players %>%
mutate(calc_R = R / calc_tbl$R,
calc_HR = HR / calc_tbl$HR,
calc_RBI = RBI / calc_tbl$RBI,
calc_SB = SB / calc_tbl$SB,
calc_BA = (((lg_avg$H * 13 ) + H)/(AB + (lg_avg$AB * 13)) - lg_avg$AVG) / calc_tbl$AVG,
calc_Total = (calc_R + calc_HR + calc_RBI + calc_SB + calc_BA))
我确实有2个问题,都集中在效率上,以及是否有更好的方法来做我正在做的事情。我可以正确调用其他df
列计算的结果吗?
是否有更直接,更有效的方式在mutate上编写最后一段代码?
答案 0 :(得分:0)
查看此方法是否适合您(使用.xlsx
):
data.table
部分结果:
# load packages
library(data.table)
# Table used for league average calculations below ----
teams <- tibble::tribble(
~Team, ~PA, ~AB, ~H, ~HBP, ~BB, ~HR, ~R, ~RBI, ~SB, ~AVG, ~OBP,
"Athletics", 6255, 5579, 1407, 76, 550, 227, 813, 778, 35, 0.252, 0.325,
"Red Sox", 6302, 5623, 1509, 55, 569, 208, 876, 829, 125, 0.268, 0.339,
"Yankees", 6271, 5515, 1374, 62, 625, 267, 851, 821, 63, 0.249, 0.329,
"Indians", 6300, 5595, 1447, 80, 554, 216, 818, 786, 135, 0.259, 0.332,
"Astros", 6146, 5453, 1390, 61, 565, 205, 797, 763, 71, 0.255, 0.329
)
setDT(teams) # set df as data.table
# Table used for player calculations (main table) -----
players <- tibble::tribble(
~Name, ~Team, ~G, ~PA, ~AB, ~H, ~HBP, ~BB, ~HR, ~R, ~RBI, ~SB, ~AVG, ~OBP,
"Mookie Betts", "Red Sox", 136, 614, 520, 180, 8, 81, 32, 129, 80, 30, 0.346, 0.438,
"Mike Trout", "Angels", 140, 608, 471, 147, 10, 122, 39, 101, 79, 24, 0.312, 0.46,
"J.D. Martinez", "Red Sox", 150, 649, 569, 188, 4, 69, 43, 111, 130, 6, 0.33, 0.402,
"Alex Bregman", "Astros", 157, 705, 594, 170, 12, 96, 31, 105, 103, 10, 0.286, 0.394,
"Jose Ramirez", "Indians", 157, 698, 578, 156, 8, 106, 39, 110, 105, 34, 0.27, 0.387
)
setDT(players) # set df as data.table
# Denominators needed for calculations----
calc_tbl <- tibble::tribble(
~data_col, ~calc_denom,
"HR", 14.3,
"R", 19.6,
"RBI", 17.5,
"SB", 26.2,
"AVG", 0.0045,
"OBP", 0.0031
)
setDT(calc_tbl) # set df as data.table
# Get league average of teams ----
lg_avg <- teams[, lapply(.SD, mean, na.rm = T), .SDcols = which(sapply(teams, is.numeric))] # summarize by columns that are numeric
# Calculate Values
cols <- names(players)[-c(1:2)] # assign an object with the column names to be calculated
res <- copy(players) # optional: I am making a copy of "players" because the `:=` operator changes the data by reference. If don't need to preserve the players table, then you don't need to make a copy, replace res in the loop and in the data.table::melt(...) expression by "players".
for(i in cols){
if(i == "OBP"){
res[, (i) := (lg_avg$OBP * lg_avg$PA) + H + BB]
next
}
res[, (i) := lapply(.SD, function(x) {
if(is.null(lg_avg[[i]])) return(NA)
return(x/lg_avg[[i]])
}), .SDcols= i]
}
res <- data.table::melt(res, id.vars = c(1:2), variable.name = "stat_value", value.name = "calc_column")
答案 1 :(得分:0)
除非我对您的问题有误解,看来您只需要基于data_col
的值应用其他公式即可?那么,为什么不直接在您的if_else
通话中添加一个mutate
,例如
已更新
value_tbl <- players %>%
gather(key = data_col, value = "stat_value", -c(Name, Team)) %>%
left_join(calc_tbl, by = "data_col") %>%
# Join on players table and drop Team so there's no column duplication
left_join(players %>% select(-Team), by = "Name") %>%
mutate(calc_column = if_else(data_col == 'OBP', lg_avg$OBP * lg_avg$PA + H + BB ,stat_value / calc_denom))