我的数据框排列如下:
# Example input: one row per (name1, name2, year) combination.
# Per the question, rows with name2 == "ALL" hold the totals against which
# the other rows of the same name1 and year are compared.
df <- structure(
  list(
    name1 = rep(c("A", "B"), each = 2, times = 2),
    name2 = c(rep(c("B", "C"), each = 2), rep("ALL", times = 4)),
    pair_id = rep(c(1, 2, 3, 4), each = 2),
    year = rep(c(2010, 2011), times = 4),
    var1 = c(1.5, 2, 4, 5, 12, 15, 20, 18),
    var2 = c(8, 10, 24, 5.5, 35, 28, 27, 26),
    var3 = c(25, 6, 12, 18.5, 30, 41, 33, 38)
  ),
  # Character row names, as in a dput() of a data frame.
  row.names = as.character(1:8),
  class = "data.frame"
)
为方便起见,我只展示了 3 个变量(var1、var2、var3)。我想针对每个 name1 和 year,计算每个变量(例如 var1、var2、var3)在各 pair_id 行上占对应总量(name2 为 "ALL" 的行)的份额。期望的输出如下:
# Desired result: the input columns plus one `<var>_share` column per variable.
# Each share is the row's value divided by the value on the name2 == "ALL" row
# with the same name1 and year, so the "ALL" rows themselves are always 1.
# var2 and var3 repeat the input values unchanged; the share columns below are
# computed from exactly those totals (e.g. 8 / 35 = 0.228571429).
df <- structure(list(name1 = c("A","A","B","B","A","A","B","B"),
name2 = c("B","B","C","C","ALL","ALL","ALL","ALL"),
pair_id = c(1,1,2,2,3,3,4,4),
year = c(2010, 2011, 2010, 2011, 2010, 2011,2010, 2011),
var1 = c(1.5,2,4,5,12,15,20,18),
var2 = c(8,10,24,5.5,35,28,27,26),
var3 = c(25,6,12,18.5,30,41,33,38),
var1_share = c(0.125, 0.133333333, 0.2, 0.277777778, 1, 1, 1, 1),
var2_share = c(0.228571429, 0.357142857, 0.888888889, 0.211538462, 1, 1, 1, 1),
var3_share = c(0.833333333, 0.146341463, 0.363636364, 0.486842105, 1, 1, 1, 1)),
.Names = c("name1","name2","pair_id","year", "var1", "var2", "var3", "var1_share", "var2_share", "var3_share"),
row.names = c("1", "2", "3", "4", "5", "6", "7", "8"), class =("data.frame"))
这是我的尝试。语法肯定是错误的。
# Asker's attempt (stated to be non-working): loop over the variable names and
# try to add a share-of-"ALL" column for each of them.
varlist <- c("var1","var2","var3")
for (var in varlist) {
# NOTE(review): `var` is the loop variable and holds a column *name* as a
# string (e.g. "var1"); `df` has no column called `var`, so the expressions
# below operate on that string rather than on the column it names.
df <- df %>%
group_by(name1, year) %>%
# Intended: pick the value of the current variable on the "ALL" row of each
# (name1, year) group.
mutate(denom = var[name2 == "ALL"]) %>%
# `add = TRUE` asks to keep the name1/year grouping and add pair_id to it.
group_by(pair_id, add = TRUE) %>%
# NOTE(review): the target is always named `var_share`, so each iteration
# would write to the same column instead of creating var1_share, var2_share, ...
mutate(var_share = (var/denom)*100)
}
非常感谢您的建议。
答案 0 :(得分:0)
使用 dplyr 和 tidyr:
library(dplyr)
library(tidyr)
# Compute each variable's share of the "ALL" total for the same name1 and year.
#
# Within every (name1, year) group, var1/var2/var3 are divided by the value on
# that group's name2 == "ALL" row and stored in a new `<var>_share` column
# (a fraction, so the "ALL" rows themselves come out as 1).
# `match()` selects the first "ALL" row of the group and gives NA shares when a
# group has no "ALL" row, instead of failing on a length mismatch.
# Only dplyr verbs are used; across() requires dplyr >= 1.0.0. This replaces
# the earlier gather()/spread()/funs() approach, which relied on deprecated
# helpers and on collapsing NA-padded rows with max().
df %>%
  group_by(name1, year) %>%
  mutate(across(
    c(var1, var2, var3),
    ~ .x / .x[match("ALL", name2)],
    .names = "{.col}_share"
  )) %>%
  ungroup() %>%
  # Column and row order are arranged to match the printed result that follows.
  select(name1, name2, year, pair_id, everything()) %>%
  arrange(name1, name2, year)
name1 name2 year pair_id var1 var2 var3 var1_share var2_share var3_share
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A ALL 2010 3 12.0 35.0 30.0 1.0000000 1.0000000 1.0000000
2 A ALL 2011 3 15.0 28.0 41.0 1.0000000 1.0000000 1.0000000
3 A B 2010 1 1.5 8.0 25.0 0.1250000 0.2285714 0.8333333
4 A B 2011 1 2.0 10.0 6.0 0.1333333 0.3571429 0.1463415
5 B ALL 2010 4 20.0 27.0 33.0 1.0000000 1.0000000 1.0000000
6 B ALL 2011 4 18.0 26.0 38.0 1.0000000 1.0000000 1.0000000
7 B C 2010 2 4.0 24.0 12.0 0.2000000 0.8888889 0.3636364
8 B C 2011 2 5.0 5.5 18.5 0.2777778 0.2115385 0.4868421