在dplyr管道链中循环处理多个列

时间:2017-07-13 20:36:35

标签: r for-loop dataframe dplyr

我的数据框排列如下:

# Example input: one row per (name1, name2, year). Rows whose name2 is "ALL"
# carry the totals for that name1/year; the other rows are the individual pairs.
# Numeric columns are kept as doubles and row names as the strings "1".."8".
df <- data.frame(
  name1   = rep(c("A", "B"), each = 2, times = 2),
  name2   = c("B", "B", "C", "C", rep("ALL", 4)),
  pair_id = rep(c(1, 2, 3, 4), each = 2),
  year    = rep(c(2010, 2011), times = 4),
  var1    = c(1.5, 2, 4, 5, 12, 15, 20, 18),
  var2    = c(8, 10, 24, 5.5, 35, 28, 27, 26),
  var3    = c(25, 6, 12, 18.5, 30, 41, 33, 38),
  row.names = as.character(seq_len(8)),
  stringsAsFactors = FALSE
)

为方便起见,这里只显示3个变量(var1、var2、var3)。我想按 year 和 pair_id,计算每个变量(例如 var1、var2、var3)占总计行(name2 为 "ALL")的百分比份额。所需的输出应为:

# Desired result: the input columns plus one <var>_share column per variable,
# where each share is the row's value divided by the value on the "ALL" row of
# the same name1/year (so the "ALL" rows themselves have a share of 1).
# var2 and var3 on the "ALL" rows are 35, 28, 27, 26 and 30, 41, 33, 38, i.e.
# the same totals as in the input data; the share columns are computed from
# exactly these totals (e.g. 8 / 35 = 0.228571429).
# NOTE: this reuses the name `df`, so evaluating it replaces the input frame.
df <- structure(list(name1       =  c("A","A","B","B","A","A","B","B"),
                 name2           =  c("B","B","C","C","ALL","ALL","ALL","ALL"),
                 pair_id         =  c(1,1,2,2,3,3,4,4),
                 year            =  c(2010, 2011, 2010, 2011, 2010, 2011,2010, 2011),
                 var1            =  c(1.5,2,4,5,12,15,20,18),
                 var2            =  c(8,10,24,5.5,35,28,27,26),
                 var3            =  c(25,6,12,18.5,30,41,33,38),
                 var1_share      =  c(0.125, 0.133333333, 0.2, 0.277777778, 1, 1, 1, 1),
                 var2_share      =  c(0.228571429, 0.357142857, 0.888888889, 0.211538462, 1, 1, 1, 1),
                 var3_share      =  c(0.833333333, 0.146341463, 0.363636364, 0.486842105, 1, 1, 1, 1)),
            .Names               =  c("name1","name2","pair_id","year", "var1", "var2", "var3", "var1_share", "var2_share", "var3_share"),
            row.names            =  c("1", "2", "3", "4", "5", "6", "7", "8"), class =("data.frame"))

这是我的尝试。语法肯定是错误的。

# Asker's attempt, kept as posted (it does not work as written). Intent: for
# each variable named in `varlist`, divide the variable by its value on the
# name2 == "ALL" row of the same name1/year group.
varlist <- c("var1","var2","var3")
for (var in varlist) { 
    df <- df %>%
    group_by(name1, year) %>%
    # `df` has no column called `var`, so `var` here is the loop variable: a
    # length-1 character string such as "var1". `var[...]` therefore subsets
    # that string, not the column it names.
    mutate(denom = var[name2 == "ALL"]) %>%
    # NOTE(review): `add =` was renamed to `.add =` in dplyr 1.0.0.
    group_by(pair_id, add = TRUE) %>%
    # `var / denom` again uses the string rather than the column values, and
    # `var_share` is a literal column name, so every iteration would target the
    # same output column instead of var1_share, var2_share, ...
    mutate(var_share = (var/denom)*100)
    }

您的意见将受到赞赏。

1 个答案:

答案 0 :(得分:0)

下面是一个使用 dplyr 和 tidyr 的解决方案:

library(dplyr)
library(tidyr)

# Compute, for every variable, its share of the "ALL" total within the same
# name1/year group, adding one <var>_share column per variable.
#
# Within each name1/year group the denominator is the value on the row whose
# name2 is "ALL"; each group is expected to contain exactly one such row
# (a group with none or several makes mutate() fail with a size error).
# Requires dplyr >= 1.0.0 for across().
df %>%
  group_by(name1, year) %>%
  mutate(across(
    c(var1, var2, var3),
    ~ .x / .x[name2 == "ALL"],   # value divided by the group's "ALL" total
    .names = "{.col}_share"
  )) %>%
  ungroup() %>%
  # Same column and row order as the printed result below.
  select(name1, name2, year, pair_id, everything()) %>%
  arrange(name1, name2, year)

  name1 name2  year pair_id  var1  var2  var3 var1_share var2_share var3_share
  <chr> <chr> <dbl>   <dbl> <dbl> <dbl> <dbl>      <dbl>      <dbl>      <dbl>
1     A   ALL  2010       3  12.0  35.0  30.0  1.0000000  1.0000000  1.0000000
2     A   ALL  2011       3  15.0  28.0  41.0  1.0000000  1.0000000  1.0000000
3     A     B  2010       1   1.5   8.0  25.0  0.1250000  0.2285714  0.8333333
4     A     B  2011       1   2.0  10.0   6.0  0.1333333  0.3571429  0.1463415
5     B   ALL  2010       4  20.0  27.0  33.0  1.0000000  1.0000000  1.0000000
6     B   ALL  2011       4  18.0  26.0  38.0  1.0000000  1.0000000  1.0000000
7     B     C  2010       2   4.0  24.0  12.0  0.2000000  0.8888889  0.3636364
8     B     C  2011       2   5.0   5.5  18.5  0.2777778  0.2115385  0.4868421