对于id
的不同值,我的start
和end
日期的相对数量为var
。
对于每条记录(对于相同的id
),start
日期与之前的end
日期相同(此处为roll
...)。
这些时期跨越多个月甚至几年。我需要将var
中的数量分成相对于每个月的实际天数的部分。 e.g。
start end var
30/01/2006 20/02/2006 104
以上我有21天,下限将属于上一期,上限为当前,所以104的1/21将分配到2006年1月,其余的将分配到2006年2月
我目前有两种方法,下面列出了虚拟数据,但它们非常慢,我想知道是否有人可以帮助我加快速度。
library(data.table)
# data
set.seed(1)
nsample <- 200L # To increase the data size just change nsample
dt <- data.table(id= 1L:nsample)
dt <- dt[, list(date=sample(seq(as.Date("2006-01-01"), as.Date("2012-01-01"), "day"), 51, F)), by=id]
setkey(dt)
dt <- dt[, {tmp <- embed(as.vector(date), 2);list(start = structure(tmp[,2], class="Date"),
end = structure(tmp[,1], class="Date"),
var = rnorm(50, 100, 5))}, by=id]
setkey(dt, id, end)
> dt[1:4]
id start end var
1: 1 2006-01-30 2006-02-20 104.41542
2: 1 2006-02-20 2006-05-15 106.89356
3: 1 2006-05-15 2006-08-21 106.71162
4: 1 2006-08-21 2006-09-30 96.21729
# Method 1
dt1 <- copy(dt)
system.time({
dt1[, id2 := 1:.N]
tmp <- dt1[, list(id = id,
date = seq(start+1, end, "day"),
var = var), by=id2]
tmp[, var := var/(.N), by=id2]
res1 <- tmp[, list(var = sum(var)), by=list(id, period = paste(year(date), month(date), sep="-"))]
})
#user system elapsed
#1.92 0.00 1.92
# Method 2
dt2 <- copy(dt)
system.time({
dt2[, Ndays := as.integer(end)-as.integer(start)]
tmp <- dt2[, list(date = seq(min(start)+1, max(end), "day")), by=id]
setkey(tmp)
res2 <- dt2[ tmp, roll=-Inf][ end >= start,list(var = sum(var/Ndays)), by=list(id, period = paste(year(end), month(end), sep="-")) ]
})
#user system elapsed
# 0.7 0.0 0.7
> sum(dt$var) == sum(res1$var)
[1] TRUE
> sum(dt$var) == sum(res2$var)
[1] TRUE
> all.equal(res1, res2)
[1] TRUE
> res2[1:4]
id period var
1: 1 2006-1 4.972163
2: 1 2006-2 109.623593
3: 1 2006-3 39.448815
4: 1 2006-4 38.176273
答案 0 :(得分:2)
这会快一点(对我来说比第二版快3倍)。我在你的第二个版本中优化了几个东西,你可以在下面看到:
# let's just divide here instead of later
dt2[, var := var/(as.integer(end)-as.integer(start))]
tmp <- dt2[, list(date = seq(min(start)+1, max(end), "day")), by=id]
# data is sorted, so no need to sort again, just set key without sort
setattr(tmp, "sorted", c("id", "date"))
res2 <- dt2[tmp, roll=-Inf][,
list(var = sum(var)),
# doing the paste in by slows it down quite a bit, so let's postpone it
by=list(id, year(end), month(end))][,
`:=`(period = paste(year, month, sep = '-'), year = NULL, month = NULL)]
重新评论大尺寸 - 您可以在dt2
内完成上述所有操作。它会慢一点,但我不会创建一个大的tmp
:
dt2[, var := var/(as.integer(end)-as.integer(start))][,
{tmp = data.table(date = seq(min(start)+1, max(end), "day"));
setattr(tmp, 'sorted', 'date');
setattr(.SD, 'sorted', 'end');
.SD[tmp, roll = -Inf][,
list(var = sum(var)), by = list(year(end), month(end))][,
`:=`(period = paste(year, month, sep = '-'), year = NULL, month = NULL)]
}, by = id]