借用SO上另一个问题(Join data.table on exact date or if not the case on the nearest less than date)中的虚拟示例:我希望连接两个表,条件是第一个表(Dt1)中的日期严格早于第二个表(Dt2)中的日期。
另外,我关闭了DataCombine解决方案所用'slide'函数的提示消息(reminder),因为它可能会不公平地拖慢mtoto解决方案的速度。
library(data.table)
# Example data for the question.
# Dt1 holds the observations (`date`, `x`); Dt2 holds the lookup dates.
# Dates are kept as character strings in m/d/Y form; convert them (e.g. with
# as.Date(date, format = "%m/%d/%Y")) before doing date arithmetic.
# The fields are whitespace-separated, so the rows must not carry a trailing
# comma after the date: read.table() would keep it as part of the value
# ("1/26/2010,") and the dates could never equal those in Dt2.
Dt1 <- read.table(text = "
date x
1/26/2010 10
1/25/2010 9
1/24/2010 9
1/22/2010 7
1/19/2010 11", header = TRUE, stringsAsFactors = FALSE)
Dt2 <- read.table(text = "
date
1/26/2010
1/23/2010
1/20/2010", header = TRUE, stringsAsFactors = FALSE)
连接后期望得到的结果:
date x
1/26/2010 - 9 # based on closest observation strictly less than date
1/23/2010 - 7
1/20/2010 - 11
(我使用data.frame格式作为mtoto解决方案的输入,使用data.table格式作为jangorecki解决方案的输入)。
# Strictly-earlier join built from a full outer join plus a one-row shift.
#
# Args:
#   Df1: data.frame of observations with columns `date` and `x`.
#   Df2: data.frame of lookup dates with a `date` column.
# Returns:
#   The inner join of Df2 with the shifted table, with the second column
#   (the unshifted `x`) dropped, leaving `date` and the shifted column that
#   slide() adds.
solution.mtoto <- function(Df1, Df2) {
  # Full outer join of the two data frames, sorted by `date`.
  # NOTE(review): the shift below relies on this sort being chronological;
  # that holds for Date columns, but character m/d/Y strings are sorted
  # lexicographically -- confirm the type of `date` in the caller's data.
  merged <- merge(Df1, Df2, by = "date", all = TRUE, sort = TRUE)
  # Shift `x` back by one row so each row carries the previous row's value;
  # `reminder = FALSE` turns off slide()'s informational message.
  merged <- DataCombine::slide(
    merged,
    Var = "x", slideBy = -1, reminder = FALSE
  )
  # Inner join on the shared columns keeps only the lookup dates.
  merge(Df2, merged)[, -2]
}
# Strictly-earlier lookup via a rolling join on an offset join column.
#
# Args:
#   Dt1: data.table of observations with columns `date` and `x`.
#   Dt2: data.table of lookup dates with a `date` column.
# Returns:
#   A data.table with the lookup `date` (taken from Dt2) and the matched `x`.
# Note: Dt2 temporarily gains a `jndate` column, removed again on exit.
solution.jangorecki <- function(Dt1, Dt2) {
  # Join key: every lookup date moved back by one, so an exact match on the
  # lookup date itself is excluded (assumes `date` is a Date/IDate -- confirm).
  Dt2[, jndate := date - 1L]
  # `:=` changed Dt2 in place; drop the helper column when the function exits.
  on.exit(Dt2[, jndate := NULL])
  # Rolling join of Dt1's `date` against the offset key.
  Dt1[Dt2, .(date = i.date, x), on = c("date" = "jndate"), roll = Inf]
}
# Time ten runs of each solution; every column of the resulting matrix is one
# system.time() measurement (rows such as "user.self" and "sys.self").
res.mtoto <- replicate(10, system.time(solution.mtoto(Df1, Df2)))
res.jangorecki <- replicate(10, system.time(solution.jangorecki(Dt1, Dt2)))
> res.mtoto[c("user.self", "sys.self"),]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
user.self 0.004 0.004 0.004 0.004 0.003 0.003 0.003 0.003 0.003 0.003
sys.self 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
> res.jangorecki[c("user.self", "sys.self"),]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
user.self 0.005 0.005 0.004 0.004 0.005 0.004 0.004 0.004 0.003 0.004
sys.self 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
编辑:之前在mtoto的解决方案中误引用了Dt1而不是Df1,现已修复。
两者速度相近(在更大的数据集上差异可能会更明显?)。我的另一个问题是:我还希望结果中同时返回所匹配到的那条记录的日期(即下面的date2列)。
例如,期望的结果是:
date - x - date2
1/26/2010 - 9 - 1/25/2010
1/23/2010 - 7 - 1/22/2010
1/20/2010 - 11 - 1/19/2010
答案 0 :(得分:4)
对日期加上-1L的偏移量后进行滚动连接。
更新(2016-04-02):借助当前开发版v1.9.7中的this commit,无需创建临时列即可完成此操作。以下摘自NEWS:
现在在j中始终可以使用前缀x.来引用x的列。当x的某一列同时也是连接列时,这尤其有用。这是一个解决#1615的补丁。
# Add the join key to Dt2 by reference: each lookup date moved back by one.
Dt2[, jndate := date - 1L]
# Rolling join of Dt1's `date` against the offset key. In `j`, `i.date` is the
# lookup date from Dt2 and `x.date` is the matched date from Dt1, returned as
# `orgdate`; `x` is Dt1's value column.
Dt1[Dt2,
.(date = i.date, orgdate = x.date, x),
on = c("date" = "jndate"),
roll = Inf]
# date orgdate x
#1: 2010-01-26 2010-01-25 9
#2: 2010-01-23 2010-01-22 7
#3: 2010-01-20 2010-01-19 11
原始答案:如果您使用的是data.table 1.9.6或更早的版本,它仍然有用。
library(data.table)
# Example data: read each table, then convert the m/d/Y strings in `date`
# to IDate in place.
Dt1 <- fread("date x
1/26/2010, 10
1/25/2010, 9
1/24/2010, 9
1/22/2010, 7
1/19/2010, 11")
Dt1[, date := as.IDate(date, format = "%m/%d/%Y")]
Dt2 <- fread("date
1/26/2010
1/23/2010
1/20/2010")
Dt2[, date := as.IDate(date, format = "%m/%d/%Y")]
# solution
# Rolling join that matches each date in Dt2 to the closest strictly earlier
# row of Dt1 and also returns the date of that matched row.
#
# Args:
#   Dt1: data.table of observations with columns `date` and `x`.
#   Dt2: data.table of lookup dates with a `date` column.
# Returns:
#   A data.table with `date` (from Dt2), `orgdate` (the matched Dt1 date)
#   and `x`.
# Note: both inputs temporarily gain a helper column by reference. Each
# cleanup is registered right after its column is created, so a failure
# part-way through cannot leave a helper column behind.
offset.roll.join <- function(Dt1, Dt2) {
  # Join key: each lookup date moved back by one.
  Dt2[, jndate := date - 1L]
  on.exit(Dt2[, jndate := NULL], add = TRUE)
  # Copy of Dt1's own date so it can be returned from the join
  # (should not be needed after data.table#1615).
  Dt1[, orgdate := date]
  on.exit(Dt1[, orgdate := NULL], add = TRUE)
  # Rolling join of Dt1's `date` against the offset key.
  Dt1[
    Dt2,
    .(date = i.date, orgdate, x),
    on = c("date" = "jndate"),
    roll = Inf
  ]
}
offset.roll.join(Dt1, Dt2)
# date orgdate x
#1: 2010-01-26 2010-01-25 9
#2: 2010-01-23 2010-01-22 7
#3: 2010-01-20 2010-01-19 11
答案 1 :(得分:1)
分三步:
library(DataCombine)
# Full outer join of the two data frames (merge() sorts by `date` by default).
merged <- merge(Dt1, Dt2, by = "date", all = TRUE)
# Shift `x` back by one row using slide() from DataCombine, so each row
# carries the value of the row before it.
merged <- slide(merged, Var = "x", slideBy = -1)
# Inner join keeps only the dates in Dt2; drop the second column (the
# unshifted `x`).
merge(Dt2, merged)[, -2]
# date x-1
#1 1/20/2010 11
#2 1/23/2010 7
#3 1/26/2010 9