根据评分数据定期计算团队的领先优势

时间:2017-04-12 20:29:12

标签: r dplyr

我有一组来自一系列曲棍球比赛的得分数据,我陷入了分析阶段。我试图在每场比赛的每十分钟计划主队的领先优势。

以下是我到目前为止获取数据集的示例:

library(tidyverse)

# Generate example data ordered by gameid and event_ts
game <- tibble(event_type = "goal", event_ts = runif(n = 1000, min = 0, max = 60),
       team = sample(c("home", "away"), size = 1000, replace = TRUE, prob = c(0.55,0.45)),
       gameid = sample(100:300, size = 1000, replace = TRUE)) %>%
  arrange(gameid, event_ts)

我知道我可以使用summarise获得每场比赛的最终得分。这是一个快速的例子,假设两支球队在每场比赛中至少得分一个目标:

game %>%
  group_by(gameid, team) %>%
  summarise(goals = n()) %>%
  spread(key = team, value = goals) %>%
  mutate(away = ifelse(is.null(away), 0, away))

我想在整个比赛中以10分钟的间隔计算主队的领先优势(正面或负面)。这需要总结到目前为止所发生的所有得分。这是我想要得到的结构的一个例子:

finished_demo <- tibble(
  gameid = sort(rep_len(seq(100, 300, 1), 1206)),
  timestamp = rep(seq(10, 60, 10), 201),
  home_lead = round(runif(
    n = 1206, min = -5, max = 7
  ))
) %>% arrange(gameid, timestamp)

4 个答案:

答案 0 :(得分:4)

这是使用 data.table ,IIUC完成它的一种方法:

require(data.table)
setDT(game) # generated with op's code but with a seed(1L)

key <- CJ(gameid=unique(game$gameid), start=1L, end=(1:6)*10L)
ans <- game[key, on=.(gameid, event_ts >= start, event_ts <= end),  # (1)
              .(home_lead=sum(team == "home")-sum(team == "away")), # (2)
              by=.EACHI]                                            # (3)

head(ans)
#    gameid event_ts event_ts home_lead
# 1:    100        1       10        NA
# 2:    100        1       20         1
# 3:    100        1       30         0
# 4:    100        1       40         0
# 5:    100        1       50        -1
# 6:    100        1       60        -2

您可以重命名重复的列名称(当我有时间处理它时,我会解决此问题。)

(1)搜索game中与key中每一行匹配的行索引,同时匹配on参数下提供的条件。

(2)计算home团队的领导。

(3).EACHI通知我应该为game的每一行key的匹配行计算主队领导。

NA表示没有匹配事件..如有必要,可以通过执行以下操作将其替换为0

ans[is.na(home_lead), home_lead := 0L]

答案 1 :(得分:1)

这个怎么样?

game %>% 
mutate(ten_min = event_ts %/% 10,
       homegoal = if_else(team == 'home', 1, -1)) %>% 
group_by(ten_min, gameid) %>% 
summarize(home_lead_interval = sum(homegoal)) %>% 
ungroup() %>% 
group_by(gameid) %>% 
mutate(home_lead = cumsum(home_lead_interval)) %>% 
arrange(gameid, ten_min)
# Source: local data frame [683 x 4]
# Groups: gameid [198]
# 
#    ten_min gameid home_lead_interval home_lead
#      <dbl>  <int>              <dbl>     <dbl>
# 1        0    100                  0         0
# 2        1    100                 -1        -1
# 3        2    100                 -3        -4
# 4        3    100                 -1        -5
# 5        4    100                  2        -3
# 6        5    100                 -1        -4
# 7        1    101                  1         1
# 8        2    101                  1         2
# 9        4    101                 -2         0
# 10       0    102                  1         1
# # ... with 673 more rows

答案 2 :(得分:1)

我99%肯定有人可以用purrr中的一些嵌入/嵌套(?)结构重写它。上面的结果不同nrow()(使用相同的数据),因此没有保证解决方案是正确的。

game %>%
      group_by(gameid) %>%
      do(data.frame(time = 10 * (1:(max(.$event_ts) %/% 10)))) %>%
      apply(1, function(x) {
                            g = x[1] %>% unlist
                            t = x[2] %>% unlist
                            game %>%
                              filter(gameid == g, event_ts < t) %>%
                              group_by(gameid, team) %>%
                              summarise(goals = n()) %>%
                              mutate(time = t)
                           }) %>%
      bind_rows %>%
      spread(key = team, value = goals) %>%
      mutate_all(as.numeric) %>%
      mutate(away = ifelse(is.na(away), 0, away),
             home = ifelse(is.na(home), 0, home))


   gameid  time  away  home
    <int> <dbl> <dbl> <dbl>
1     100    10     0     1
2     100    20     1     3
3     100    30     1     3
4     101    20     0     1
5     101    30     1     1
6     101    40     1     2
7     101    50     1     2

答案 3 :(得分:1)

我的想法是每10分钟获得主场和客场比分。然后,您可以根据gameid对data.frame进行分组,并创建您想要的结果。

set.seed(123)
# Generate example data ordered by gameid and event_ts
game <- tibble(event_type = "goal", event_ts = runif(n = 1000, min = 0, max = 60),
               team = sample(c("home", "away"), size = 1000, replace = TRUE, prob = c(0.55,0.45)),
               gameid = sample(100:300, size = 1000, replace = TRUE)) %>%
  arrange(gameid, event_ts)

# Change the event_ts and get all 10 minutes intervals
hl <- game %>%
  mutate(event_ts=ceiling(event_ts / 10) * 10) %>%
  dcast(gameid + event_ts ~ team, length) %>%
  right_join(expand.grid(gameid=unique(game$gameid), event_ts=seq(10, 60, 10)))
hl$away[is.na(hl$away)] <- 0
hl$home[is.na(hl$home)] <- 0
# Get the home lead
hl <- hl %>%
  arrange(gameid, event_ts) %>%
  group_by(gameid) %>%
  mutate(away=cumsum(away),
         home=cumsum(home),
         home_lead=home - away)

# Check the game 100 and 101
game %>% filter(gameid %in% 100:101)
# A tibble: 7 × 4
  event_type  event_ts  team gameid
       <chr>     <dbl> <chr>  <int>
1       goal 30.460972  home    100
2       goal 57.270219  home    100
3       goal  1.126093  home    101
4       goal 27.879957  home    101
5       goal 33.086101  home    101
6       goal 42.497419  away    101
7       goal 45.649418  home    101

hl %>% filter(gameid %in% 100:101)
Source: local data frame [12 x 5]
Groups: gameid [2]

   gameid event_ts  away  home home_lead
    <int>    <dbl> <dbl> <dbl>     <dbl>
1     100       10     0     0         0
2     100       20     0     0         0
3     100       30     0     0         0
4     100       40     0     1         1
5     100       50     0     1         1
6     100       60     0     2         2
7     101       10     0     1         1
8     101       20     0     1         1
9     101       30     0     2         2
10    101       40     0     3         3
11    101       50     1     4         3
12    101       60     1     4         3