改善不良/可能不必要的性能在R

时间:2019-05-03 19:29:42

标签: r performance apply

在此先感谢您的帮助。我不确定我使用的是apply是错误的,还是只是违反了其他使我的代码变慢的规则。任何帮助表示赞赏。

概述:我有篮球数据,其中每一行都是篮球比赛中的时刻,包括场上的10名球员,他们的球队,比赛以及比赛的时间(1-40)该行所在的位置。使用这些数据,我为每个球员计算他们在1分钟至40分钟内每次在场上的比赛百分比。

例如,如果乔的球队出战20场比赛,并且如果在其中的13场比赛中,乔在比赛的第5分钟被发现在数据中,那么我们可以说乔在65的第5分钟被发现在场上他球队比赛的百分比。我正在用不太小的数据为每个球员,每个赛季,每1-40分钟计算一次,并且遇到性能问题。这是我目前执行此操作的功能:

library(dplyr)

# Raw Data Is Play-By-Play Data - Each Row contains stats for a pl (combination of 5 basketball players)
sheets_url <- 'https://docs.google.com/spreadsheets/d/1xmzaF6tpzVpjOmgfwHwFM_JE8LUszofjj25A5P0P21o/export?format=csv&id=1xmzaF6tpzVpjOmgfwHwFM_JE8LUszofjj25A5P0P21o&gid=630752085'
on.ct.data <- httr::content(httr::GET(url = sheets_url))

computeOnCourtByMinutePcts <- function(on.ct.data) {

  # Create Dataframe With Number Of Games Played By Team Each Season
  num.home.team.games <- on.ct.data %>%
    dplyr::group_by(homeTeamId, season) %>%
    dplyr::summarise(count = length(unique(gameId)))

  num.away.team.games <- on.ct.data %>%
    dplyr::group_by(awayTeamId, season) %>%
    dplyr::summarise(count = length(unique(gameId)))

  num.team.games <- num.home.team.games %>%
    dplyr::full_join(num.away.team.games, by = c('homeTeamId'='awayTeamId', 'season'='season')) %>%
    dplyr::mutate(gamesPlayed = rowSums(cbind(count.x, count.y), na.rm = TRUE)) %>%
    dplyr::rename(teamId = homeTeamId) %>%
    dplyr::mutate(season = as.character(season)) %>%
    dplyr::select(teamId, season, gamesPlayed)

  # Create Dataframe With Players By Season - Seems kind of bulky as well
  all.player.season.apperances <- rbind(
    on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId1, season) %>% dplyr::rename(playerId = onCtHomeId1, teamId = homeTeamId),
    on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId2, season) %>% dplyr::rename(playerId = onCtHomeId2, teamId = homeTeamId),
    on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId3, season) %>% dplyr::rename(playerId = onCtHomeId3, teamId = homeTeamId),
    on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId4, season) %>% dplyr::rename(playerId = onCtHomeId4, teamId = homeTeamId),
    on.ct.data %>% dplyr::select(homeTeamId, onCtHomeId5, season) %>% dplyr::rename(playerId = onCtHomeId5, teamId = homeTeamId),
    on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId1, season) %>% dplyr::rename(playerId = onCtAwayId1, teamId = awayTeamId),
    on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId2, season) %>% dplyr::rename(playerId = onCtAwayId2, teamId = awayTeamId),
    on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId3, season) %>% dplyr::rename(playerId = onCtAwayId3, teamId = awayTeamId),
    on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId4, season) %>% dplyr::rename(playerId = onCtAwayId4, teamId = awayTeamId),
    on.ct.data %>% dplyr::select(awayTeamId, onCtAwayId5, season) %>% dplyr::rename(playerId = onCtAwayId5, teamId = awayTeamId)) %>%
    dplyr::distinct(teamId, playerId, season) %>%
    dplyr::filter(!is.na(playerId))

  # For Each Player-Season, Compute Number Of Games On Court at each minute in game - this is the bad Apply
  playing.time.breakdowns <- apply(X = all.player.season.apperances, MARGIN = 1, FUN = function(thisRow) {

    # Set Player / Season Variables
    thisPlayerId = thisRow[2]
    thisSeason = thisRow[3]

    # Filter for each unique minute of each game with this player on court
    on.court.df = on.ct.data %>% 
      dplyr::filter(onCtHomeId1 == thisPlayerId | onCtHomeId2 == thisPlayerId | onCtHomeId3 == thisPlayerId | onCtHomeId4 == thisPlayerId | onCtHomeId5 == thisPlayerId |
                      onCtAwayId1 == thisPlayerId | onCtAwayId2 == thisPlayerId | onCtAwayId3 == thisPlayerId | onCtAwayId4 == thisPlayerId | onCtAwayId5 == thisPlayerId) %>%
      dplyr::filter(season == thisSeason) %>%
      dplyr::filter(!duplicated(paste0(gameId, minNumIntoGame)))

    # Turn This Into a table of minutes on court by game
    thisTable <- table(on.court.df$minNumIntoGame)

    this.player.distrubution.df <- data.frame(
      playerId = thisRow[2],
      teamId = thisRow[1],
      season = thisRow[3],
      minNumIntoGame = as.integer(names(thisTable)),
      numGamesAtMinNum = unname(thisTable) %>% as.vector(),
      stringsAsFactors = FALSE
    )

    # 40 minutes in basketball game, so previous dataframe needs 40 rows
    if(length(which(!(1:40 %in% this.player.distrubution.df$minNumIntoGame))) > 0) {
      zero.mins.played.df <- data.frame(
        playerId = thisRow[2],
        teamId = thisRow[1],
        season = thisRow[3],
        minNumIntoGame = which(!(1:40 %in% this.player.distrubution.df$minNumIntoGame)),
        numGamesAtMinNum = 0,
        stringsAsFactors = FALSE
      )

      this.player.distrubution.df <- plyr::rbind.fill(this.player.distrubution.df, zero.mins.played.df) %>% dplyr::arrange(minNumIntoGame)
    }

    # and return
    return(this.player.distrubution.df)
  })

  # Combine the output into one dataframe
  playing.time.breakdowns <- playing.time.breakdowns %>% do.call("rbind", .)

  # Join on Team-Games played
  playing.time.breakdowns <- playing.time.breakdowns %>%
    dplyr::left_join(num.team.games, by = c("teamId"="teamId", "season"="season")) %>%
    dplyr::rename(teamGamesPlayed = gamesPlayed)

  # Compute pct of games played
  playing.time.breakdowns <- playing.time.breakdowns %>%
    dplyr::mutate(pctMinNumPlayed = round(numGamesAtMinNum / teamGamesPlayed, 3))

  # Handle OT (minNumIntoGame > 40) needs a lower gamesPlayed denominator...

  # And Return
  return(playing.time.breakdowns);
}
on.ct.by.min <- computeOnCourtByMinutePcts(on.ct.data)

总而言之,代码执行以下操作:

  1. 创建所有唯一的球员赛季和球队赛季的初始数据框。对于团队赛季,请使用pbp数据来计算比赛次数。
  2. 应用-对于每个球员赛季:(a)在每场比赛的每一分钟内找到球员在场上的每个实例(在10 onCt列之一中),(b)将其转换为该表显示了玩家在1-40分钟内每次在场上的比赛次数。
  3. 抛光并返回。将几张表连接在一起,然后计算相关百分比。

请注意,通过手动运行apply的一行all.player.season.appearances可能会更容易。将thisRow设置为数据帧中的任何行,然后逐行运行代码以使内容更加清晰。

为了突出显示慢速代码问题,我已经将大量的按次播放/场上数据上传到Google表格,将其公开,并在上面的代码中包含了加载数据的链接。 Google表格有约1/2的当前数据,但是在不久的将来,我的总数据量预计将增加10倍,并且该代码目前需要约8分钟才能在我的计算机上运行。这是一个脚本,需要每天且相当快地运行,而我花了80分钟才能承担这个功能。

感觉我的apply()调用没有完成,好像没有比普通的for循环快。我不确定完全不需要申请,实际上,我认为不是。但是在过去的24小时里,我一直在苦苦思索如何改善此功能,但没有运气。这里一定有更好的方法!

编辑:可重现的示例中有一个小错误,目前正在研究中。 Edit2:修复了在num.team.games数据框中创建NA的问题。我只是运行了代码,它似乎工作正常。大约有600行输出,其中teamId为NA,这没什么好担心的。

Edit3:看起来,每次应用迭代都需要0.06秒,并且数据帧中有5312行,总计运行时间约为8分钟。我应该尝试将0.06减小到<0.01,还是放弃整个方法?这是我不确定的主要问题...

1 个答案:

答案 0 :(得分:1)

我认为可以通过将数据转换为长格式并计算球员-分钟-球队-赛季的组合来更简单地实现。 (从2008年开始,在这台旧计算机上运行大约需要5秒钟,这是大部分计算。)

library(tidyverse)
on.ct.data %>%
  gather(spot, name, onCtHomeId1:onCtAwayId5) %>%
  mutate(team = if_else(spot %>% str_detect("Away"),
                        awayTeamId, homeTeamId)) %>%
  select(-spot) %>%  # For this part, I only care about person and minute of game.
  distinct() %>%  # Drop dupes and instances where they were repositioned within one minute.
  drop_na()  %>%
  select(-c(gameId:awayTeamId)) %>%
  count(minNumIntoGame, name, team, season)

# A tibble: 140,581 x 5
   minNumIntoGame name              team  season     n
            <dbl> <chr>             <chr>  <dbl> <int>
 1              1 AahmaneSantos387c JAC     1819     1
 2              1 AamirSimmseef9    CLEM    1819    13
 3              1 AarenEdmead9cd6   NCAT    1718     1
 4              1 AarenEdmead9cd6   NCAT    1819     1
 5              1 AaronBrennanbee2  IUPU    1718     1
 6              1 AaronCalixtea11d  OKLA    1819    11
 7              1 AaronCarver9cfa   ODU     1819     2
 8              1 AaronClarke3d67   SHU     1819     1
 9              1 AaronFalzon213b   NW      1718     1
10              1 AaronHolidayfce6  UCLA    1718    11

现在有了这个,我们可以检查每个团队的游戏世界是什么样的。每个团队在给定的分钟内每个赛季有多少场比赛?

on.ct.data.team.minutes <- on.ct.data.minute.counts %>%
  count(season, team, minNumIntoGame, gameId) %>%  
  count(season, team, minNumIntoGame) 

ggplot(on.ct.data.team.minutes %>% slice(1:1000),
       aes(minNumIntoGame, team, fill = n)) + 
  geom_tile() + facet_wrap(~season) + 
  labs(title = "# times each team played each minute (excerpt)")

enter image description here

...我们可以对每个球员做同样的事情,并与他们的球队进行比较,以了解他们每分钟为球队效力的比例。

# How many games each season did each player play a given minute for each team?
on.ct.data.player.minutes <- on.ct.data.minute.counts %>%
  count(season, team, name, minNumIntoGame) %>%
  rename(player_n = n) %>%
  left_join(on.ct.data.team.minutes) %>%
  rename(team_n = n) %>% 
  mutate(player_time = player_n / team_n)

ggplot(on.ct.data.player.minutes %>% filter(name %>% str_detect("Can")),
       aes(minNumIntoGame, player_time, color = name)) +
  geom_line() + facet_wrap(~season) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1))

enter image description here