我有以下data.table
:
CODE ID VALUE YEAR_MONTH temp_YEAR_MONTH
1: ABOUDERE 12608095 1 199206 1992-06-01
2: ABOUDERE 12608095 1 199207 1992-07-01
3: ABOUDERE 12608095 1 199208 1992-08-01
4: ABOUDERE 12608095 1 199209 1992-09-01
5: ABOUDERE 12608095 1 199210 1992-10-01
---
1012974: DCBEZOND 88619 1 201711 2017-11-01
1012975: ABOUDERE 90325 1 201711 2017-11-01
1012976: ABOUDERE 91301 1 201711 2017-11-01
1012977: ABOUDERE 91808 1 201711 2017-11-01
1012978: ABOUDERE 92866 1 201711 2017-11-01
我希望拥有的是一个额外的专栏,它告诉我ID是如何出现的,这是过去5年的唯一时间......仅限(最多60个)
E.G。
CODE ID VALUE YEAR_MONTH temp_YEAR_MONTH APPEARANCES_LAST_5_YEARS
1: ABOUDERE 12608095 1 199206 1992-06-01 1
2: ABOUDERE 12608095 1 199207 1992-07-01 2
3: ABOUDERE 12608095 1 199208 1992-08-01 3
4: ABOUDERE 12608095 1 199209 1992-09-01 4
5: ABOUDERE 12608095 1 199210 1992-10-01 5
---
1012978: ABOUDERE 92866 1 201711 2017-11-01 60
我这样做的方式是通过:
dt$temp_YEAR_MONTH <- as.Date(paste(dt$YEAR_MONTH,'01'), format = '%Y%m%d')
dt$APPEARANCES_LAST_5_YEARS = 0
tmp.temp_YEAR_MONTH = dt$temp_YEAR_MONTH
tmp.ID= dt$ID
id_date_function <- function(id, date){
sum(tmp.ID == id & tmp.temp_YEAR_MONTH < as.Date(paste(date,'01'), format = '%Y%m%d') &
tmp.temp_YEAR_MONTH > as.Date(paste(as.numeric(date)-500,'01'), format = '%Y%m%d'))
}
print('this will take some time')
dt$APPEARANCES_LAST_5_YEARS <-
apply(dt, 1, function(x) id_date_function(x['ID'], x['YEAR_MONTH']))
但这很慢......对于1.000.000记录,需要+ 13小时。 有人有更好的方法吗?
答案 0 :(得分:2)
这可以使用范围连接(非等连接)来解决,并在使用by = .EACHI
加入期间进行聚合:
library(data.table)
library(lubridate)
DT[, mon := ymd(YEAR_MONTH, truncated = 1L)][
, APPEARANCES_LAST_5_YEARS :=
.SD[.(ID, mon, mon - months(5L * 12L)),
on = .(ID, mon <= V2, mon > V3), .N, by = .EACHI]$N][, mon := NULL][]
CODE ID VALUE YEAR_MONTH APPEARANCES_LAST_5_YEARS 1: ABOUDERE 12608095 1 199206 1 2: ABOUDERE 12608095 1 199207 2 3: ABOUDERE 12608095 1 199208 3 4: ABOUDERE 12608095 1 199209 4 5: ABOUDERE 12608095 1 199210 5 6: DCBEZOND 88619 1 201711 1 7: ABOUDERE 90325 1 201711 1 8: ABOUDERE 91301 1 201711 1 9: ABOUDERE 91808 1 201711 1 10: ABOUDERE 92866 1 201711 1
不幸的是,OP提供的样本数据集不足以覆盖5年的时间。为了证明只考虑了一段时间,为了示范目的,期限限制在3个月:
DT[, mon := ymd(YEAR_MONTH, truncated = 1L)][
, APPEARANCES_LAST_3_MONTHS :=
.SD[.(ID, mon, mon - months(3L)),
on = .(ID, mon <= V2, mon > V3), .N, by = .EACHI]$N][, mon := NULL][]
CODE ID VALUE YEAR_MONTH APPEARANCES_LAST_3_MONTHS 1: ABOUDERE 12608095 1 199206 1 2: ABOUDERE 12608095 1 199207 2 3: ABOUDERE 12608095 1 199208 3 4: ABOUDERE 12608095 1 199209 3 5: ABOUDERE 12608095 1 199210 3 6: DCBEZOND 88619 1 201711 1 7: ABOUDERE 90325 1 201711 1 8: ABOUDERE 91301 1 201711 1 9: ABOUDERE 91808 1 201711 1 10: ABOUDERE 92866 1 201711 1
library(data.table)
DT <- fread("id CODE ID VALUE YEAR_MONTH temp_YEAR_MONTH
1: ABOUDERE 12608095 1 199206 1992-06-01
2: ABOUDERE 12608095 1 199207 1992-07-01
3: ABOUDERE 12608095 1 199208 1992-08-01
4: ABOUDERE 12608095 1 199209 1992-09-01
5: ABOUDERE 12608095 1 199210 1992-10-01
1012974: DCBEZOND 88619 1 201711 2017-11-01
1012975: ABOUDERE 90325 1 201711 2017-11-01
1012976: ABOUDERE 91301 1 201711 2017-11-01
1012977: ABOUDERE 91808 1 201711 2017-11-01
1012978: ABOUDERE 92866 1 201711 2017-11-01 ",
drop = c(1L, 6L))
DT
CODE ID VALUE YEAR_MONTH 1: ABOUDERE 12608095 1 199206 2: ABOUDERE 12608095 1 199207 3: ABOUDERE 12608095 1 199208 4: ABOUDERE 12608095 1 199209 5: ABOUDERE 12608095 1 199210 6: DCBEZOND 88619 1 201711 7: ABOUDERE 90325 1 201711 8: ABOUDERE 91301 1 201711 9: ABOUDERE 91808 1 201711 10: ABOUDERE 92866 1 201711