我一直在尝试切换到dplyr和tidyr进行数据操作(而不是data.table和excel)。我有一个长格式的数据框,如下所示:
TIME GEO geo_num sex_num AGE Value
2014 EU28 1 1 0 13486357
2014 EU28 1 1 5 13683976
2014 EU28 1 1 10 13430899
2014 EU28 1 1 15 13945295
2014 EU28 1 1 20 15417002
2014 EU28 1 1 25 16233349
我想得到的是每个年龄组(AGE)的Value在对应性别(sex_num)总数中所占的比例:
TIME GEO geo_num sex_num AGE Value percent
2014 EU28 1 1 0 13486357 0.537
2014 EU28 1 1 5 13683976 0.548
2014 EU28 1 1 10 13430899 0.537
2014 EU28 1 1 15 13945295 0.555
2014 EU28 1 1 20 15417002 0.613
2014 EU28 1 1 25 16233349 0.646
用下面的代码,我可以得到按性别汇总的总数(也就是我的分母):
# Sum Value within each geo_num / sex_num / TIME group; this per-group total
# is the denominator for the share calculation.
mydata %>%
  group_by(geo_num, sex_num, TIME) %>%
  summarise(total_sex = sum(Value))
但我不太清楚如何利用这个总数来计算百分比。下面是我的尝试:
# Attempt to compute each row's share of its sex total.
# NOTE(review): total_sex is not a column of the sample data shown in this
# post, and the summarize() call that computes it is a separate expression
# whose result is never assigned, so this mutate() has no total_sex to
# divide by.
mydata %>%
group_by(sex_num, TIME, geo_num, AGE) %>%
mutate(freq = Value / total_sex)
有什么想法吗?
这是数据的子集
# Sample data as a structure() literal (appears to be dput() output): a
# 40-row data.frame with columns X, TIME, GEO, geo_num, GEO.1, SEX, sex_num,
# AGE and Value; 20 rows have sex_num 1 and 20 have sex_num 2.
# The expression is not assigned here; presumably it is meant to be assigned
# to `mydata` before running the other snippets.
structure(list(X = 1:40, TIME = c(2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L
), GEO = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "EU28", class = "factor"),
geo_num = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), GEO.1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "European Union (28 countries)", class = "factor"),
SEX = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Females", "Males"), class = "factor"), sex_num = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), AGE = c(0, 5, 10, 15,
20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 85.99,
90.99, 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60,
65, 70, 75, 80, 85, 85.99, 90.99), Value = c(13486357L, 13683976L,
13430899L, 13945295L, 15417002L, 16233349L, 17074499L, 17647415L,
18386977L, 18914596L, 17914397L, 16416147L, 14854062L, 12613840L,
10126857L, 8102599L, 5377238L, 2718258L, 3756915L, 1038657L,
12805779L, 12992860L, 12754636L, 13227105L, 14824565L, 15915997L,
16894408L, 17437631L, 18269939L, 18931544L, 18244203L, 17188595L,
16064384L, 14111303L, 12145307L, 10862721L, 8471793L, 5480758L,
8448678L, 2967920L)), .Names = c("X", "TIME", "GEO", "geo_num",
"GEO.1", "SEX", "sex_num", "AGE", "Value"), class = "data.frame", row.names = c(NA, -40L))
答案 0 :(得分:3)
类似下面这样的写法应该可以得到你想要的结果:
# For every TIME / GEO / geo_num / GEO.1 / SEX / sex_num group, add:
#   total_sex - the sum of Value over the group (the denominator)
#   percent   - each row's share of that total, on a 0-100 scale
# mutate() keeps all rows, so the group total is repeated on each row.
mydata <- mydata %>%
  group_by(TIME, GEO, geo_num, GEO.1, SEX, sex_num) %>%
  mutate(
    total_sex = sum(Value),
    percent = Value / total_sex * 100
  ) %>%
  # Drop the grouping so later verbs on mydata are not silently applied
  # per group.
  ungroup()
> head(mydata)
# A tibble: 6 x 11
X TIME GEO SEX AGE Value total_sex percent
1 2014 EU28 Males 0 13486357 251139335 5.370070
2 2014 EU28 Males 5 13683976 251139335 5.448759
3 2014 EU28 Males 10 13430899 251139335 5.347987
4 2014 EU28 Males 15 13945295 251139335 5.552812
5 2014 EU28 Males 20 15417002 251139335 6.138824
6 2014 EU28 Males 25 16233349 251139335 6.463881
# ... with 3 more variables
答案 1 :(得分:0)
我们可以使用data.table
library(data.table)
# Convert mydata to a data.table and add `percent` with `:=`. The grouping
# columns are those at positions 2 to 7 of mydata. Within each group the
# value is 10 * Value / sum(Value); the factor 10 reproduces the magnitudes
# shown in the question's desired output.
setDT(mydata)[, percent := 10 * Value / sum(Value), by = names(mydata)[2:7]]
head(mydata)
# X TIME GEO geo_num GEO.1 SEX sex_num AGE Value percent
#1: 1 2014 EU28 1 European Union (28 countries) Males 1 0 13486357 0.5370070
#2: 2 2014 EU28 1 European Union (28 countries) Males 1 5 13683976 0.5448759
#3: 3 2014 EU28 1 European Union (28 countries) Males 1 10 13430899 0.5347987
#4: 4 2014 EU28 1 European Union (28 countries) Males 1 15 13945295 0.5552812
#5: 5 2014 EU28 1 European Union (28 countries) Males 1 20 15417002 0.6138824
#6: 6 2014 EU28 1 European Union (28 countries) Males 1 25 16233349 0.6463881