如何使用dplyr对来自另一个表的数据进行计算?

时间:2019-01-24 15:41:58

标签: r dplyr tidyverse

我正在做一些研究,需要我在几个单独的df中进行分析。单独的df的结果将在一个主要计算df中使用。大多数统计信息类别都在多个df中使用。我希望使用来自单独的df

的结果

# Table used for league average calculations below
teams <- tibble::tribble(
  ~Team,  ~PA,  ~AB,   ~H, ~HBP, ~BB, ~HR,  ~R, ~RBI, ~SB,  ~AVG,  ~OBP,
  "Athletics", 6255, 5579, 1407,   76, 550, 227, 813,  778,  35, 0.252, 0.325,
  "Red Sox", 6302, 5623, 1509,   55, 569, 208, 876,  829, 125, 0.268, 0.339,
  "Yankees", 6271, 5515, 1374,   62, 625, 267, 851,  821,  63, 0.249, 0.329,
  "Indians", 6300, 5595, 1447,   80, 554, 216, 818,  786, 135, 0.259, 0.332,
  "Astros", 6146, 5453, 1390,   61, 565, 205, 797,  763,  71, 0.255, 0.329
)


# Table used for player calculations (main table)
players <- tibble::tribble(
  ~Name,     ~Team,  ~G, ~PA, ~AB,  ~H, ~HBP, ~BB, ~HR,  ~R, ~RBI, ~SB,  ~AVG,  ~OBP,
  "Mookie Betts", "Red Sox", 136, 614, 520, 180,    8,  81,  32, 129,   80,  30, 0.346, 0.438,
  "Mike Trout",  "Angels", 140, 608, 471, 147,   10, 122,  39, 101,   79,  24, 0.312,  0.46,
  "J.D. Martinez", "Red Sox", 150, 649, 569, 188,    4,  69,  43, 111,  130,   6,  0.33, 0.402,
  "Alex Bregman",  "Astros", 157, 705, 594, 170,   12,  96,  31, 105,  103,  10, 0.286, 0.394,
  "Jose Ramirez", "Indians", 157, 698, 578, 156,    8, 106,  39, 110,  105,  34,  0.27, 0.387
)


# Denominators needed for calculations
calc_tbl <- tibble::tribble(
  ~data_col, ~calc_denom,
  "HR",        14.3,
  "R",        19.6,
  "RBI",        17.5,
  "SB",        26.2,
  "AVG",      0.0045,
  "OBP",      0.0031
) %>% 
  spread(key = data_col, value = "calc_denom")


# Get league average of teams
lg_avg <- teams %>% 
# Divide counting stats by 10 to get the averages for 10 batters
  mutate_at(vars(PA:SB), funs(./10)) %>% 
  summarize_if(is.numeric, mean, na.rm=TRUE)

lg_avg
#> # A tibble: 1 x 11
#>      PA    AB     H   HBP    BB    HR     R   RBI    SB   AVG   OBP
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  625.  555.  143.  6.68  57.3  22.5  83.1  79.5  8.58 0.257 0.331

# Calculate Values
value_tbl <- players %>% 
  mutate(calc_R = R / calc_tbl$R,
         calc_HR = HR / calc_tbl$HR,
         calc_RBI = RBI / calc_tbl$RBI,
         calc_SB = SB / calc_tbl$SB,
         calc_BA = (((lg_avg$H * 13 ) + H)/(AB + (lg_avg$AB * 13)) - lg_avg$AVG) / calc_tbl$AVG,
         calc_Total = (calc_R + calc_HR + calc_RBI + calc_SB + calc_BA))

我确实有2个问题,都集中在效率上,以及是否有更好的方法来做我正在做的事情。我可以正确调用其他df列计算的结果吗?
是否有更直接,更有效的方式在mutate上编写最后一段代码?

2 个答案:

答案 0 :(得分:0)

查看此方法是否适合您(使用.xlsx):

data.table

部分结果:

# load packages
library(data.table)

# Table used for league average calculations below ----
teams <- tibble::tribble(
  ~Team,  ~PA,  ~AB,   ~H, ~HBP, ~BB, ~HR,  ~R, ~RBI, ~SB,  ~AVG,  ~OBP,
  "Athletics", 6255, 5579, 1407,   76, 550, 227, 813,  778,  35, 0.252, 0.325,
  "Red Sox", 6302, 5623, 1509,   55, 569, 208, 876,  829, 125, 0.268, 0.339,
  "Yankees", 6271, 5515, 1374,   62, 625, 267, 851,  821,  63, 0.249, 0.329,
  "Indians", 6300, 5595, 1447,   80, 554, 216, 818,  786, 135, 0.259, 0.332,
  "Astros", 6146, 5453, 1390,   61, 565, 205, 797,  763,  71, 0.255, 0.329
)
setDT(teams) # set df as data.table

# Table used for player calculations (main table) -----
players <- tibble::tribble(
  ~Name,     ~Team,  ~G, ~PA, ~AB,  ~H, ~HBP, ~BB, ~HR,  ~R, ~RBI, ~SB,  ~AVG,  ~OBP,
  "Mookie Betts", "Red Sox", 136, 614, 520, 180,    8,  81,  32, 129,   80,  30, 0.346, 0.438,
  "Mike Trout",  "Angels", 140, 608, 471, 147,   10, 122,  39, 101,   79,  24, 0.312,  0.46,
  "J.D. Martinez", "Red Sox", 150, 649, 569, 188,    4,  69,  43, 111,  130,   6,  0.33, 0.402,
  "Alex Bregman",  "Astros", 157, 705, 594, 170,   12,  96,  31, 105,  103,  10, 0.286, 0.394,
  "Jose Ramirez", "Indians", 157, 698, 578, 156,    8, 106,  39, 110,  105,  34,  0.27, 0.387
)
setDT(players) # set df as data.table

# Denominators needed for calculations----
calc_tbl <- tibble::tribble(
  ~data_col, ~calc_denom,
  "HR",        14.3,
  "R",        19.6,
  "RBI",        17.5,
  "SB",        26.2,
  "AVG",      0.0045,
  "OBP",      0.0031
)
setDT(calc_tbl) # set df as data.table

# Get league average of teams ----
lg_avg <- teams[, lapply(.SD, mean, na.rm = T), .SDcols = which(sapply(teams, is.numeric))] # summarize by columns that are numeric

# Calculate Values
cols <- names(players)[-c(1:2)] # assign an object with the column names to be calculated

res <- copy(players) # optional: I am making a copy of "players" because the `:=` operator changes the data by reference. If don't need to preserve the players table, then you don't need to make a copy, replace res in the loop and in the data.table::melt(...) expression by "players".

    for(i in cols){
  if(i == "OBP"){
    res[, (i) := (lg_avg$OBP * lg_avg$PA) + H + BB]
    next
  }
  res[, (i) := lapply(.SD, function(x) {
    if(is.null(lg_avg[[i]])) return(NA)
    return(x/lg_avg[[i]])
    }), .SDcols= i]
}

res <- data.table::melt(res, id.vars = c(1:2), variable.name = "stat_value", value.name = "calc_column")

答案 1 :(得分:0)

除非我对您的问题有误解,看来您只需要基于data_col的值应用其他公式即可?那么,为什么不直接在您的if_else通话中添加一个mutate,例如

已更新

value_tbl <- players %>% 
  gather(key = data_col, value = "stat_value", -c(Name, Team)) %>% 
  left_join(calc_tbl, by = "data_col") %>% 
  # Join on players table and drop Team so there's no column duplication
  left_join(players %>% select(-Team), by = "Name") %>%
  mutate(calc_column = if_else(data_col == 'OBP', lg_avg$OBP * lg_avg$PA + H + BB ,stat_value / calc_denom))