R:难以按球队分组聚合篮球统计数据

时间:2016-08-08 16:37:30

标签: r

提前感谢您提供的任何帮助或建议。下面是我正在使用的数据框的一个精简示例。

# Sample box-score data frame: 40 rows and five columns
# (game_id, team_id, minutes_played, fieldGoalsMade, fieldGoalAttempts).
# It covers two game_id values and four team_id values; per the accompanying
# description, each row holds one player's stats for one game.
boxscore_stats = structure(list(game_id = c(157046L, 157046L, 157046L, 157046L, 
157046L, 157046L, 157046L, 157046L, 157046L, 157046L, 157046L, 
157046L, 157046L, 157046L, 157046L, 157046L, 157046L, 157046L, 
159151L, 159151L, 159151L, 159151L, 159151L, 159151L, 159151L, 
159151L, 159151L, 159151L, 159151L, 159151L, 159151L, 159151L, 
159151L, 159151L, 159151L, 159151L, 159151L, 159151L, 159151L, 
159151L), team_id = c(116975, 116975, 116975, 116975, 116975, 
116975, 116975, 116975, 116975, 120310, 120310, 120310, 120310, 
120310, 120310, 120310, 120310, 120310, 121910, 121910, 121910, 
121910, 121910, 121910, 121910, 121910, 121910, 121910, 122072, 
122072, 122072, 122072, 122072, 122072, 122072, 122072, 122072, 
122072, 122072, 122072), minutes_played = c(18.76, 14.63, 8, 
16.69, 24.62, 32, 12.79, 5.28, 3.22, 24.35, 10.18, 20.65, 9.59, 
25.08, 14.12, 17.46, 23.15, 15.43, 22.84, 19.27, 21.31, 6.41, 
17.57, 17.4, 17.29, 7.22, 12.09, 17.25, 2.28, 16.87, 6.6, 19.73, 
6.31, 13.25, 26.25, 6.08, 28.71, 11.2, 17.54, 5.17), fieldGoalsMade = c(1L, 
1L, 4L, 1L, 2L, 7L, 1L, 1L, 1L, 4L, 0L, 3L, 1L, 3L, 0L, 6L, 7L, 
1L, 7L, 4L, 5L, 1L, 2L, 6L, 2L, 0L, 1L, 3L, 0L, 1L, 1L, 3L, 0L, 
1L, 11L, 2L, 5L, 1L, 2L, 1L), fieldGoalAttempts = c(8L, 6L, 7L, 
2L, 9L, 16L, 3L, 1L, 2L, 12L, 4L, 12L, 3L, 11L, 4L, 9L, 13L, 
6L, 12L, 10L, 14L, 2L, 6L, 11L, 6L, 2L, 2L, 6L, 0L, 5L, 3L, 10L, 
2L, 3L, 21L, 3L, 17L, 4L, 9L, 2L)), .Names = c("game_id", "team_id", 
"minutes_played", "fieldGoalsMade", "fieldGoalAttempts"), row.names = c(NA, 
40L), class = "data.frame")


# NOTE: head() prints only the first 6 rows by default; the listing that
# follows shows all 40 rows of the data frame.
head(boxscore_stats)

   game_id team_id minutes_played fieldGoalsMade fieldGoalAttempts
1   157046  116975          18.76              1                 8
2   157046  116975          14.63              1                 6
3   157046  116975           8.00              4                 7
4   157046  116975          16.69              1                 2
5   157046  116975          24.62              2                 9
6   157046  116975          32.00              7                16
7   157046  116975          12.79              1                 3
8   157046  116975           5.28              1                 1
9   157046  116975           3.22              1                 2
10  157046  120310          24.35              4                12
11  157046  120310          10.18              0                 4
12  157046  120310          20.65              3                12
13  157046  120310           9.59              1                 3
14  157046  120310          25.08              3                11
15  157046  120310          14.12              0                 4
16  157046  120310          17.46              6                 9
17  157046  120310          23.15              7                13
18  157046  120310          15.43              1                 6
19  159151  121910          22.84              7                12
20  159151  121910          19.27              4                10
21  159151  121910          21.31              5                14
22  159151  121910           6.41              1                 2
23  159151  121910          17.57              2                 6
24  159151  121910          17.40              6                11
25  159151  121910          17.29              2                 6
26  159151  121910           7.22              0                 2
27  159151  121910          12.09              1                 2
28  159151  121910          17.25              3                 6
29  159151  122072           2.28              0                 0
30  159151  122072          16.87              1                 5
31  159151  122072           6.60              1                 3
32  159151  122072          19.73              3                10
33  159151  122072           6.31              0                 2
34  159151  122072          13.25              1                 3
35  159151  122072          26.25             11                21
36  159151  122072           6.08              2                 3
37  159151  122072          28.71              5                17
38  159151  122072          11.20              1                 4
39  159151  122072          17.54              2                 9
40  159151  122072           5.17              1                 2

关于这个数据框需要注意的重要一点是:每个game_id对应两个team_id,即参加该场比赛的两支球队。每个game_id唯一标识一场篮球比赛。每一行对应该场比赛中team_id所示球队的一名球员的统计数据。上面的示例只有2场比赛/4支球队/40名球员,但我的完整数据框有数百场比赛,每支球队都会出现多次。

我能够做的第一个聚合是通过team_id聚合所有内容。这段代码为我完成了第一次聚合的工作:

# Sum every column of boxscore_stats within each team. The grouping key is the
# second column of the data frame (team_id); because the `by` list is unnamed,
# aggregate() returns the key in a column called `Group.1`.
boxscore_stats_aggregated <- aggregate(
  x = boxscore_stats,
  by = list(boxscore_stats[, 2]),
  FUN = sum
)

这相当简单。对于任何team_id,我已经汇总了他们所有的上场时间、所有的fieldGoalsMade等等。对于下一个聚合,我需要再次按team_id聚合,但这次不是汇总球队自己的行/统计数据,而是汇总其对手的行/统计数据。这回答的是这样的问题:"对于任何一支球队,他们总共让对手投进了多少个球(fieldGoalsMade)等等。"所以在这个例子中,对于team_id = 116975,我想汇总team_id 120310的所有行。当然,当team_id 116975下一次出现在数据框中的另一场比赛里时,他们面对的很可能是不同的对手,所以这个聚合并不像直接聚合team_id 120310的行那么简单。

我认为应该可以利用"每个game_id下恰好对应两个team_id"这一关系来实现这种聚合,但我还没想出该如何实现。

谢谢!

3 个答案:

答案 0 :(得分:2)

这是使用data.table的方法:

(1)读入数据:

# Load package: data.table provides fread() and the DT[i, j, by] syntax.
library(data.table)

# Load your data: the multi-line string is handed to fread() as the table
# itself. Its header names six columns; the first one, `row`, holds the
# printed row numbers of the original data frame.
boxscore_stats <- fread("row game_id team_id minutes_played fieldGoalsMade fieldGoalAttempts
1   157046  116975          18.76              1                 8
           2   157046  116975          14.63              1                 6
           3   157046  116975           8.00              4                 7
           4   157046  116975          16.69              1                 2
           5   157046  116975          24.62              2                 9
           6   157046  116975          32.00              7                16
           7   157046  116975          12.79              1                 3
           8   157046  116975           5.28              1                 1
           9   157046  116975           3.22              1                 2
           10  157046  120310          24.35              4                12
           11  157046  120310          10.18              0                 4
           12  157046  120310          20.65              3                12
           13  157046  120310           9.59              1                 3
           14  157046  120310          25.08              3                11
           15  157046  120310          14.12              0                 4
           16  157046  120310          17.46              6                 9
           17  157046  120310          23.15              7                13
           18  157046  120310          15.43              1                 6
           19  159151  121910          22.84              7                12
           20  159151  121910          19.27              4                10
           21  159151  121910          21.31              5                14
           22  159151  121910           6.41              1                 2
           23  159151  121910          17.57              2                 6
           24  159151  121910          17.40              6                11
           25  159151  121910          17.29              2                 6
           26  159151  121910           7.22              0                 2
           27  159151  121910          12.09              1                 2
           28  159151  121910          17.25              3                 6
           29  159151  122072           2.28              0                 0
           30  159151  122072          16.87              1                 5
           31  159151  122072           6.60              1                 3
           32  159151  122072          19.73              3                10
           33  159151  122072           6.31              0                 2
           34  159151  122072          13.25              1                 3
           35  159151  122072          26.25             11                21
           36  159151  122072           6.08              2                 3
           37  159151  122072          28.71              5                17
           38  159151  122072          11.20              1                 4
           39  159151  122072          17.54              2                 9
           40  159151  122072           5.17              1                 2
           ")

(2)继续进行实际计算:

# Collapse the player rows to one row per (game_id, team_id) by summing every
# remaining column (data.table style). .SD also contains the `row` index
# column read by fread(), so that column is summed as well.
boxscore_stats_aggregated <- boxscore_stats[
  ,
  lapply(.SD, sum),
  by = .(game_id, team_id)
]

# Self-join on game_id so every team row is paired with each team of the same
# game, then keep only the pairs whose two team ids differ. The result still
# has two rows per game, with the opponent's totals in the columns carrying
# the ".opponent" suffix. Teams whose opponent rows are missing from the data
# drop out of the result.
merge(
  x = boxscore_stats_aggregated,
  y = boxscore_stats_aggregated,
  by = "game_id",
  suffixes = c("", ".opponent")
)[team_id != team_id.opponent]

output看起来像这样:

# > output
#    game_id team_id row minutes_played fieldGoalsMade fieldGoalAttempts team_id.opponent row.opponent minutes_played.opponent fieldGoalsMade.opponent fieldGoalAttempts.opponent
# 1: 1413414  116975  45         135.99             19                54           120310          126                  160.01                      25                         74
# 2: 1413414  120310 126         160.01             25                74           116975           45                  135.99                      19                         54

答案 1 :(得分:1)

供OP或未来的读者参考:下面是使用merge()的base R版本,它按game_id把球队自身与其对手的聚合结果并排放在同一行。过程中需要一个临时列gamecount。

# TEAM AGGREGATION
# Sum every stat column down to one row per (game_id, team_id).
aggdf <- aggregate(. ~ game_id + team_id, data = boxscore_stats, FUN = sum)

# GAME COUNT BY TEAM (TEMP COL USED FOR MERGE/FILTER)
# Running position of each row within its game_id (1 for the first team listed
# for a game, 2 for the second, ...). ave() computes this per group directly,
# replacing the sapply(1:nrow(aggdf), ...) rescan of all earlier rows.
aggdf$gamecount <- ave(seq_along(aggdf$game_id), aggdf$game_id, FUN = seq_along)

# MERGE AND FILTER
# The self-merge pairs every team of a game with every team of the same game.
# Keep only pairs of two different teams whose left-hand team is the first one
# listed for the game, so each pairing appears once instead of twice.
mdf <- merge(aggdf, aggdf, by = "game_id")
mdf <- mdf[mdf$team_id.x != mdf$team_id.y & mdf$gamecount.x == 1, ]
mdf$gamecount.x <- mdf$gamecount.y <- NULL

# RENAME COL AND ROW NAMES
# Anchor the patterns with `$` so that only the merge suffixes are rewritten,
# not a ".x" / ".y" that happens to occur elsewhere in a column name.
names(mdf) <- sub("\\.x$", "", names(mdf))
names(mdf) <- sub("\\.y$", ".opp", names(mdf))
# Reset to automatic 1..n row names; unlike 1:nrow(mdf), this also works when
# the filter above leaves zero rows.
rownames(mdf) <- NULL

#   game_id team_id minutes_played fieldGoalsMade fieldGoalAttempts team_id.opp
# 1  157046  116975         135.99             19                54      120310
# 2  159151  121910         158.65             31                71      122072
#   minutes_played.opp fieldGoalsMade.opp fieldGoalAttempts.opp
# 1             160.01                 25                    74
# 2             159.99                 28                    79

答案 2 :(得分:0)

如果你想隔离单个team_id,我会使用dplyr包。

例如,如果您想知道每支球队的投篮命中率(field goal percentage),我会写出类似下面的代码:

library(dplyr)

# Field-goal percentage per team: total field goals made divided by total
# field goal attempts, summed over all rows that share a team_id.
boxscore_stats %>%
  group_by(team_id) %>%
  summarize(perc_fg = sum(fieldGoalsMade) / sum(fieldGoalAttempts))

这会为您提供一个按球队(team_id)汇总的新data.frame:boxscore_stats %>% group_by(team_id) %>% summarize(perc_fg = sum(fieldGoalsMade)/sum(fieldGoalAttempts))