标题有点啰嗦,但是下面的示例非常清楚地说明了我正在努力解决的问题。我可以通过for循环得到想要的结果,但更希望使用不依赖for循环的解决方案:
# Example data for the question. The structure(...) expression below is the
# text printed by dput(mydf): a 24-row tibble (class tbl_df) with the columns
# shooterFullName, distanceCategory, season and plAttempts.
dput(mydf)
structure(list(shooterFullName = c("Ky Bowman", "Ky Bowman",
"Ky Bowman", "Ky Bowman", "Ky Bowman", "Ky Bowman", "Ky Bowman",
"Ky Bowman", "Ky Bowman", "Ky Bowman", "Ky Bowman", "Ky Bowman",
"Markell Johnson", "Markell Johnson", "Markell Johnson", "Markell Johnson",
"Markell Johnson", "Markell Johnson", "Markell Johnson", "Markell Johnson",
"Markell Johnson", "Markell Johnson", "Markell Johnson", "Markell Johnson"
), distanceCategory = c("atr2", "atr2", "lng2", "lng2", "lng3",
"lng3", "mid2", "mid2", "sht2", "sht2", "sht3", "sht3", "atr2",
"atr2", "lng2", "lng2", "lng3", "lng3", "mid2", "mid2", "sht2",
"sht2", "sht3", "sht3"), season = c("1718", "1819", "1718", "1819",
"1718", "1819", "1718", "1819", "1718", "1819", "1718", "1819",
"1718", "1819", "1718", "1819", "1718", "1819", "1718", "1819",
"1718", "1819", "1718", "1819"), plAttempts = c(49L, 12L, 30L,
10L, 60L, 29L, 25L, 20L, 46L, 44L, 66L, 34L, 23L, 10L, 4L, 3L,
15L, 13L, 12L, 8L, 27L, 16L, 31L, 27L)), row.names = c(NA, -24L
), class = c("tbl_df", "tbl", "data.frame"))
head(mydf, 18)
shooterFullName distanceCategory season plAttempts
<chr> <chr> <chr> <int>
1 Ky Bowman atr2 1718 49
2 Ky Bowman atr2 1819 12
3 Ky Bowman lng2 1718 30
4 Ky Bowman lng2 1819 10
5 Ky Bowman lng3 1718 60
6 Ky Bowman lng3 1819 29
7 Ky Bowman mid2 1718 25
8 Ky Bowman mid2 1819 20
9 Ky Bowman sht2 1718 46
10 Ky Bowman sht2 1819 44
11 Ky Bowman sht3 1718 66
12 Ky Bowman sht3 1819 34
13 Markell Johnson atr2 1718 23
14 Markell Johnson atr2 1819 10
15 Markell Johnson lng2 1718 4
16 Markell Johnson lng2 1819 3
17 Markell Johnson lng3 1718 15
18 Markell Johnson lng3 1819 13
我的目标如下:我想添加一个新列distFreq,先按shooterFullName和season分组,然后计算在每个分组内,该行distanceCategory对应的plAttempts占该球员该赛季plAttempts总数的比例。
这是一个使用for循环的示例,但是出于各种原因,我想避免使用for循环。
# Baseline for-loop implementation: for every (player, season) pair, compute
# each row's share of that pair's total plAttempts and store it in distFreq.
mydf$distFreq <- 0
all_players <- unique(mydf$shooterFullName)
all_years <- unique(mydf$season)
# seq_along() rather than 1:length(): the latter evaluates to c(1, 0) when the
# vector is empty, so the loop body would run with out-of-range indices.
for (i in seq_along(all_players)) {
  for (j in seq_along(all_years)) {
    # Rows belonging to the current player-season pair.
    subsetdf <- mydf %>%
      dplyr::filter(shooterFullName == all_players[i]) %>%
      dplyr::filter(season == all_years[j])
    # Share of the pair's total attempts contributed by each row.
    subsetdf <- subsetdf %>%
      dplyr::mutate(distFreq = plAttempts / sum(plAttempts))
    # Write the shares back; the logical mask picks the same rows, in the same
    # order, as the two filter() calls above.
    mydf$distFreq[
      mydf$shooterFullName == all_players[i] & mydf$season == all_years[j]
    ] <- subsetdf$distFreq
  }
}
head(mydf, 12)
shooterFullName distanceCategory season plAttempts distFreq
<chr> <chr> <chr> <int> <dbl>
1 Ky Bowman atr2 1718 49 0.178
2 Ky Bowman atr2 1819 12 0.0805
3 Ky Bowman lng2 1718 30 0.109
4 Ky Bowman lng2 1819 10 0.0671
5 Ky Bowman lng3 1718 60 0.217
6 Ky Bowman lng3 1819 29 0.195
7 Ky Bowman mid2 1718 25 0.0906
8 Ky Bowman mid2 1819 20 0.134
9 Ky Bowman sht2 1718 46 0.167
10 Ky Bowman sht2 1819 44 0.295
11 Ky Bowman sht3 1718 66 0.239
12 Ky Bowman sht3 1819 34 0.228
最好是使用dplyr的解决方案,但我用dplyr一直没能做出来——如果有任何想法,将不胜感激,谢谢!
答案 0 :(得分:2)
我们可以执行以下操作:
# Group by player and season so that sum(plAttempts) is evaluated within each
# player-season; mutate() then adds each row's share of that group total.
# The result stays grouped by shooterFullName and season; append ungroup()
# if later steps should operate on the whole table.
mydf %>%
  group_by(shooterFullName, season) %>%
  mutate(distFreq = plAttempts / sum(plAttempts))
# A tibble: 24 x 5
# Groups: shooterFullName, season [4]
# shooterFullName distanceCategory season plAttempts distFreq
# <chr> <chr> <chr> <int> <dbl>
# 1 Ky Bowman atr2 1718 49 0.178
# 2 Ky Bowman atr2 1819 12 0.0805
# 3 Ky Bowman lng2 1718 30 0.109
# 4 Ky Bowman lng2 1819 10 0.0671
# 5 Ky Bowman lng3 1718 60 0.217
# 6 Ky Bowman lng3 1819 29 0.195
# 7 Ky Bowman mid2 1718 25 0.0906
# 8 Ky Bowman mid2 1819 20 0.134
# 9 Ky Bowman sht2 1718 46 0.167
# 10 Ky Bowman sht2 1819 44 0.295
# ... with 14 more rows
这与您用for循环所做的完全相同,只是使用group_by让代码更加简洁。