我正在尝试使用R中的data.table优化r代码片段以计算滞后差异。我有两个工作解决方案,但两者都在我的真实数据(5亿行数据集)上运行缓慢。我一般都喜欢使用data.table的加速和效率,但我实现的两个解决方案都很慢(与其他data.table操作相比)。
是否有人可以就 data.table 提供更高效的编码实践建议?
# Reproducible toy example: lagged date differences within each id.
library(data.table)

set.seed(1)
id <- 1:10
date_samp <- seq.Date(as.Date("2010-01-01"), as.Date("2011-01-01"), "days")
dt1 <- data.table(
  id = sample(id, size = 30, replace = TRUE),
  date_1 = sample(date_samp, size = 30, replace = TRUE)
)
# Keying sorts by id, then date_1, so consecutive rows within an id are in
# date order.
setkey(dt1, id, date_1)

### Attempt to get lagged date

## Attempt 1
# diff() returns one element fewer than the group, so pad the first row with 0.
dt1[, date_diff := c(0, diff(date_1)), by = id]

## Attempt 2
## Manual difference: every row but the first minus every row but the last.
dt1[, date_diff := NULL]
# Group size stored as a column; kept so the table has the same columns as
# before, although the difference below no longer indexes with it.
dt1[, n_group := .N, by = id]
# Negative indexing replaces 2:n_group / 1:(n_group - 1): `:` only uses the
# first element of a vector operand (with a warning), and for a one-row group
# 2:1 and 1:0 count backwards, producing a result of the wrong length.
dt1[, date_diff := c(0, date_1[-1L] - date_1[-.N]), by = id]
答案 0 :(得分:7)
经过一番努力后,我在相关问题上找到了“shift()”函数。我已经使数据更大并进行了一些粗略的分析,并添加了一些更多的方法......但如果有更有效的方法,请更新并提供不同的答案。
针对下面的评论,我补充并修改了一些内容:之前的尝试使用的是 numeric(而不是 integer),而且我设置键(key)的方式不正确。我增加了一个使用 integer 的对比,以及一个使用键连接的 integer 版本(此前只有 numeric 版本)。现在看来,先把日期转换为整数,再使用 by = .EACHI(按 i 的每一行分组)是最快的方案。
# Benchmark of several ways to compute within-id lagged date differences.
library(data.table)

set.seed(1)
id <- 1:100
date_samp <- seq.Date(as.Date("2010-01-01"), as.Date("2011-01-01"), "days")
n_samp <- 1e7
dt1 <- data.table(
  id = sample(id, size = n_samp, replace = TRUE),
  date_1 = sample(date_samp, size = n_samp, replace = TRUE)
)
# Keying sorts by id, then date_1, so rows within an id are in date order.
setkey(dt1, id, date_1)

### Attempt to get lagged date

## Attempt 1
# date_diff does not exist yet, so there is nothing to drop first
# (data.table warns when asked to delete a column that is not there).
system.time(dt1[, date_diff := c(0, diff(date_1)), by = id])

## Attempt 2
## Manual difference: every row but the first minus every row but the last.
dt1[, date_diff := NULL]
# Group size stored as a column; kept so the table has the same columns as
# before, although the difference below no longer indexes with it.
dt1[, n_group := .N, by = id]
# Negative indexing replaces 2:n_group / 1:(n_group - 1): `:` only uses the
# first element of a vector operand (with a warning) and counts backwards for
# a one-row group.
system.time(dt1[, date_diff := c(0, date_1[-1L] - date_1[-.N]), by = id])

## Attempt 3
# shift() pads with NA by default, so from here on the first date_diff of
# each id is NA rather than the 0 produced by attempts 1 and 2.
dt1[, date_diff := NULL]
system.time(dt1[, date_diff := date_1 - shift(date_1), by = id])

## Attempt 4
## Use numeric instead
dt1[, date_diff := NULL]
# date_1num is created here for the first time, so nothing to drop beforehand.
dt1[, date_1num := as.numeric(date_1)]
system.time(dt1[, date_diff := date_1num - shift(date_1num), by = id])

## Attempt 5
## Use a keyed by
# One row per id; joining dt1 to it with by = .EACHI evaluates j once for
# each row of dt_key, i.e. once per id.
dt_key <- unique(dt1[, list(id)])
dt1[, date_diff := NULL]
system.time(
  dt1[dt_key, date_diff := date_1num - shift(date_1num), by = .EACHI]
)

## Attempt 6
## Use integers instead
dt1[, date_diff := NULL]
dt1[, date_1int := as.integer(date_1)]
system.time(dt1[, date_diff := date_1int - shift(date_1int), by = id])

## Attempt 7
## Use integers with keyed by
# Reuses date_1int from attempt 6 and dt_key from attempt 5.
dt1[, date_diff := NULL]
system.time(
  dt1[dt_key, date_diff := date_1int - shift(date_1int), by = .EACHI]
)

# Timings (seconds) as reported by the original author; attempt 2 was timed
# in its original 2:n_group index-based form.
# attempt user system elapsed
# 1 0.34 0.25 0.59
# 2 0.37 0.28 0.67
# 3 0.25 0.16 0.41
# 4 0.11 0.01 0.13
# 5 0.06 0.03 0.10
# 6 0.09 0.00 0.09
# 7 0.05 0.00 0.04
答案 1 :(得分:1)
如果你想修改by:
# Lagged date difference without `by`: sort once, compare every row with the
# row above it, and zero the difference wherever a new id starts.
# dt1[order(...)] returns a sorted copy, so the chain below works on that copy
# and the result is only printed (trailing `[]`), not stored back into dt1.
dt1[order(id, date_1)
  # id of the previous row (NA for the very first row)
  ][, idP := shift(id, type = "lag")
  # TRUE on the first row of each id
  ][, headP := is.na(idP) | idP != id
  # date of the previous row
  ][, date_1P := shift(date_1, type = "lag")
  # first row of each id has no predecessor within its group
  ][headP == TRUE, date_diff := 0
  ][headP == FALSE, date_diff := date_1 - date_1P
  # drop the helper columns
  ][, c("headP", "idP", "date_1P") := NULL
  ][]