在R中,按2列分组,然后计算第3列的百分比频率

时间:2018-12-23 23:16:15

标签: r dplyr data-manipulation

标题有点啰嗦,但是下面的示例可以非常清楚地说明我正在努力解决的问题。我可以通过for循环实现所需的结果,但是更希望使用不依赖for循环的解决方案:

# Reproducible dump of the example data. The structure(...) call below is
# the output of dput(mydf); evaluating it rebuilds a 24-row tibble with the
# character columns shooterFullName, distanceCategory and season, and the
# integer column plAttempts.
dput(mydf)
structure(list(shooterFullName = c("Ky Bowman", "Ky Bowman", 
"Ky Bowman", "Ky Bowman", "Ky Bowman", "Ky Bowman", "Ky Bowman", 
"Ky Bowman", "Ky Bowman", "Ky Bowman", "Ky Bowman", "Ky Bowman", 
"Markell Johnson", "Markell Johnson", "Markell Johnson", "Markell Johnson", 
"Markell Johnson", "Markell Johnson", "Markell Johnson", "Markell Johnson", 
"Markell Johnson", "Markell Johnson", "Markell Johnson", "Markell Johnson"
), distanceCategory = c("atr2", "atr2", "lng2", "lng2", "lng3", 
"lng3", "mid2", "mid2", "sht2", "sht2", "sht3", "sht3", "atr2", 
"atr2", "lng2", "lng2", "lng3", "lng3", "mid2", "mid2", "sht2", 
"sht2", "sht3", "sht3"), season = c("1718", "1819", "1718", "1819", 
"1718", "1819", "1718", "1819", "1718", "1819", "1718", "1819", 
"1718", "1819", "1718", "1819", "1718", "1819", "1718", "1819", 
"1718", "1819", "1718", "1819"), plAttempts = c(49L, 12L, 30L, 
10L, 60L, 29L, 25L, 20L, 46L, 44L, 66L, 34L, 23L, 10L, 4L, 3L, 
15L, 13L, 12L, 8L, 27L, 16L, 31L, 27L)), row.names = c(NA, -24L
), class = c("tbl_df", "tbl", "data.frame"))

head(mydf, 18)
   shooterFullName distanceCategory season plAttempts
   <chr>           <chr>            <chr>       <int>
 1 Ky Bowman       atr2             1718           49
 2 Ky Bowman       atr2             1819           12
 3 Ky Bowman       lng2             1718           30
 4 Ky Bowman       lng2             1819           10
 5 Ky Bowman       lng3             1718           60
 6 Ky Bowman       lng3             1819           29
 7 Ky Bowman       mid2             1718           25
 8 Ky Bowman       mid2             1819           20
 9 Ky Bowman       sht2             1718           46
10 Ky Bowman       sht2             1819           44
11 Ky Bowman       sht3             1718           66
12 Ky Bowman       sht3             1819           34
13 Markell Johnson atr2             1718           23
14 Markell Johnson atr2             1819           10
15 Markell Johnson lng2             1718            4
16 Markell Johnson lng2             1819            3
17 Markell Johnson lng3             1718           15
18 Markell Johnson lng3             1819           13

我的目标如下:我想添加一个新列distFreq——先按shooterFullName和season分组,然后计算每一行的plAttempts占该球员在该赛季plAttempts总和的比例,也就是该行distanceCategory所占的出手频率。

这是一个使用for循环的示例,但是出于各种原因,我想避免使用for循环。

# Baseline for-loop version: for every (player, season) pair, compute each
# row's share of that pair's total plAttempts and write it into distFreq.
mydf$distFreq <- 0
all_players <- unique(mydf$shooterFullName)
all_years <- unique(mydf$season)
# seq_along() rather than 1:length(): when there are no players or seasons
# the loops are skipped instead of iterating over c(1, 0).
for (i in seq_along(all_players)) {
  for (j in seq_along(all_years)) {
    # Rows of this player in this season, with their within-group share.
    subsetdf <- mydf %>%
      dplyr::filter(
        shooterFullName == all_players[i],
        season == all_years[j]
      ) %>%
      dplyr::mutate(distFreq = plAttempts / sum(plAttempts))

    # filter() keeps the original row order, so the shares line up with the
    # rows selected by the same condition in mydf.
    in_group <- mydf$shooterFullName == all_players[i] &
      mydf$season == all_years[j]
    mydf$distFreq[in_group] <- subsetdf$distFreq
  }
}

head(mydf, 12)
   shooterFullName distanceCategory season plAttempts distFreq
   <chr>           <chr>            <chr>       <int>    <dbl>
 1 Ky Bowman       atr2             1718           49   0.178 
 2 Ky Bowman       atr2             1819           12   0.0805
 3 Ky Bowman       lng2             1718           30   0.109 
 4 Ky Bowman       lng2             1819           10   0.0671
 5 Ky Bowman       lng3             1718           60   0.217 
 6 Ky Bowman       lng3             1819           29   0.195 
 7 Ky Bowman       mid2             1718           25   0.0906
 8 Ky Bowman       mid2             1819           20   0.134 
 9 Ky Bowman       sht2             1718           46   0.167 
10 Ky Bowman       sht2             1819           44   0.295 
11 Ky Bowman       sht3             1718           66   0.239 
12 Ky Bowman       sht3             1819           34   0.228 

最好是dplyr的解决方案,但我用dplyr一直没能实现这一点。如果有任何想法,将不胜感激,谢谢!

1 个答案:

答案 0 :(得分:2)

我们可以执行以下操作:

# Group by player and season so that sum(plAttempts) is that player's total
# for the season; each row's distFreq is its share of that total.
# ungroup() drops the grouping afterwards, so the result is a plain tibble
# like the one the for-loop version produces and later verbs are not
# silently applied per group.
mydf %>%
  group_by(shooterFullName, season) %>%
  mutate(distFreq = plAttempts / sum(plAttempts)) %>%
  ungroup()
# A tibble: 24 x 5
#    shooterFullName distanceCategory season plAttempts distFreq
#    <chr>           <chr>            <chr>       <int>    <dbl>
#  1 Ky Bowman       atr2             1718           49   0.178
#  2 Ky Bowman       atr2             1819           12   0.0805
#  3 Ky Bowman       lng2             1718           30   0.109
#  4 Ky Bowman       lng2             1819           10   0.0671
#  5 Ky Bowman       lng3             1718           60   0.217
#  6 Ky Bowman       lng3             1819           29   0.195
#  7 Ky Bowman       mid2             1718           25   0.0906
#  8 Ky Bowman       mid2             1819           20   0.134
#  9 Ky Bowman       sht2             1718           46   0.167
# 10 Ky Bowman       sht2             1819           44   0.295
# ... with 14 more rows

除了使用group_by使事情变得更加简洁以外,这正是您所做的。