我努力寻找以下问题的解决方案。我有一个id's/ dob's
的df和另一个如下的monthbucket df
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10),
id = seq(1:10) )
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),'%Y-%m'),
startmonth = seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),
endmonth = seq(as.Date("2010-02-01"),as.Date("2011-02-01"),by="months")-1)
我尝试获得一个输出,该输出为我提供了每个每月时段的年龄组(below 19\ 19-64\ above 64)
中的成员数量。显然,这一数字在人们过生日的那一年会发生变化。
我用类似
来计算年龄age.fct <- function(dob, bucketdate) {
period <- as.period(interval(dob, bucketdate),unit = "year")
period$year}
我想一般的方法是计算每个月度时段的年龄,将其分配给3 age groups
中的一个,然后按月计数。有什么建议吗?
编辑1。
由于采用了所有不同的方法,我只是对解决方案进行了简短的基准测试,以确定可以接受的答案。某种程度上,数据表解决方案无法在我的测试数据集上使用,但是接下来的几天我会尽快检查。
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10000),
id = seq(1:10000) )
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),'%Y-%m'),
startmonth = seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),
endmonth = seq(as.Date("2010-02-01"),as.Date("2011-02-01"),by="months")-1)
birth_days <- df$dob
month_bucket <- monthbucket$startmonth
和基准
microbenchmark::microbenchmark(
MM= monthbucket %>% group_by_all %>% expand(id=df$id) %>% left_join(.,{df %>% mutate(birth_month =cut(dob, "month"))},by="id") %>% mutate(age=time_length(difftime(startmonth, birth_month),"years")) %>%
mutate(age_cat=case_when(age<19 ~ "<19", age>64 ~ ">64",TRUE ~ "19-64")) %>% group_by(month) %>% count(age_cat) %>% gather(variable, count, n) %>%
unite(variable, age_cat) %>% spread(variable, count)
,
AkselA = {ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
ages <- do.call(data.frame, lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
},
Cole1 ={t(table(apply(X = outer(month_bucket, birth_days, `-`) / 365.25, MARGIN = 2, FUN = cut, c(0,19,65, Inf)), rep(format(month_bucket,'%Y-%m'), length(birth_days))))
},
# cole2={ cast(CJ(month_bucket, birth_days)[, .N, by = .(month_bucket , cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))], month_bucket ~ cut, value.var = 'N')
# },
#
Cole3={crossing(month_bucket, birth_days)%>%count(month_bucket, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0,19,65,Inf)))%>%spread(age_range, n)
},
Cole4={all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
all_combos$cut_r <- cut(all_combos$age, c(0,19,65,Inf))
reshape(data = aggregate( all_combos$month_bucket, by = list(bucket = all_combos$month_bucket,age_group = all_combos$cut_r), FUN = length), timevar = 'age_group' , idvar = 'bucket', direction = 'wide' )
},
times = 1L)
Unit: milliseconds
expr min lq mean median uq max neval
MM 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 1
AkselA 17.12697 17.12697 17.12697 17.12697 17.12697 17.12697 1
Cole1 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 1
Cole3 23.63945 23.63945 23.63945 23.63945 23.63945 23.63945 1
Cole4 877.92782 877.92782 877.92782 877.92782 877.92782 877.92782 1
基于速度,AkselA的方法似乎是最快的,但是与其他方法相比,MM的方法得到了不同的结果(一旦在削减部分cut, c(0, 19, 64, Inf)..
中将AkselA的值更改为65。我将接受基于速度,但会查看结果差异!
答案 0 :(得分:3)
不太复杂,但我加入了两个表(首先在monthbucket
上展开了df$id
),然后计算了年龄(因为您有整个月的时间,我只是使用{出生月份的第一天和difftime
)。然后,对于每个月(存储桶),我计算了不同年龄组的数量,最后将长格式转换为宽格式,以便更好地说明。
startmonth
由reprex package(v0.3.0)于2019-07-03创建
答案 1 :(得分:2)
假设我了解您的要求。
ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
ages <- do.call(data.frame,
lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
ages
# 2010-01 2010-02 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 2011-01
# 0-19 2 2 2 2 2 2 2 2 2 2 2 2 2
# 19-64 7 7 7 7 7 7 7 7 7 7 7 7 7
# 64+ 1 1 1 1 1 1 1 1 1 1 1 1 1
#
答案 2 :(得分:2)
@AkselA的答案有一些相似之处,因为它取决于outer()
,cut()
和table()
。
set.seed(33)
birth_days <- sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10)
month_bucket <- seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months")
t(
table(
apply(
X = outer(month_bucket, birth_days, `-`) / 365.25
, MARGIN = 2
, FUN = cut, c(0,19,65, Inf)
)
, rep(format(month_bucket,'%Y-%m'), length(birth_days))
)
)
(0,19] (19,65] (65,Inf]
2010-01 2 7 1
2010-02 2 7 1
2010-03 2 7 1
2010-04 2 7 1
2010-05 2 7 1
2010-06 2 7 1
2010-07 2 7 1
2010-08 2 7 1
2010-09 2 7 1
2010-10 2 7 1
2010-11 2 7 1
2010-12 2 7 1
2011-01 2 7 1
我觉得有一个类似的解决方案很奇怪,所以这里是data.table
:
library(data.table)
dcast(
CJ(month_bucket, birth_days
)[, .N
, by = .(month_bucket
, cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))
]
, month_bucket ~ cut
, value.var = 'N')
dplyr
和tidyr
:
library(dplyr)
library(tidyr)
crossing(month_bucket, birth_days)%>%
count(month_bucket
, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0,19,65,Inf))
)%>%
spread(age_range, n)
还有一种我不太满意的类似方法。
all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
all_combos$cut_r <- cut(all_combos$age, c(0,19,65,Inf))
reshape(
data = aggregate(
all_combos$month_bucket
, by = list(bucket = all_combos$month_bucket
,age_group = all_combos$cut_r)
, FUN = length)
, timevar = 'age_group'
, idvar = 'bucket'
, direction = 'wide'
)