我想同时依据一个相等条件和两个不等式条件来连接两个数据表。下面是这两个表,以及用 sqldf 得到的期望输出:
library(data.table)
library(sqldf)
# Fix the RNG seed so the random columns z and t are reproducible.
set.seed(1)

# Left-hand table: join key x, value y to be tested against a range, random z.
DT1 <- data.table(
  x = c(1, 2, 3, 4, 5),
  y = c(15, 25, 35, 45, 55),
  z = rnorm(5)
)

# Right-hand table: join key a, range bounds y_start / y_end, payload t.
DT2 <- data.table(
  a = c(2, 4, 5, 8, 9),
  y_start = c(20, 11, 54, 31, 60),
  y_end = c(27, 14, 55, 37, 70),
  t = sample(1000:2000, size = 5)
)
x y z
1 15 -0.626453811
2 25 0.183643324
3 35 -0.835628612
4 45 1.595280802
5 55 0.329507772
a y_start y_end t
2 20 27 1206
4 11 14 1176
5 54 55 1686
8 31 37 1383
9 60 70 1767
# Reference result: left join of DT1 to DT2 where x equals a and y lies
# between y_start and y_end (both bounds inclusive).
output <- sqldf("select DT1.*, DT2.t
from DT1 left join DT2
on DT1.x = DT2.a
and DT1.y >= DT2.y_start
and DT1.y <= DT2.y_end")
#desired output
x y z t
1 15 -0.626453811 NA
2 25 0.183643324 1206
3 35 -0.835628612 NA
4 45 1.595280802 NA
5 55 0.329507772 1686
我尝试用 data.table 实现相同的输出,写出了下面的代码,但结果并不是我需要的:
# Inner join (nomatch = 0L) with DT2 as i. In the printed result below, the
# columns y and y.1 carry DT2's y_start / y_end values.
DT <- DT1[
  DT2,
  on = .(x = a, y >= y_start, y <= y_end),
  nomatch = 0L
]
x y z y.1 t
2 20 0.183643324 27 1206
5 54 0.329507772 55 1686
我可以在此结果的基础上补上 DT1 中缺失的行,并删除 y.1 列,但也许有办法直接得到所需结果?
答案 0 :(得分:3)
您也可以直接在连接查询中完成列的选择。
# Join with DT1 as i and select the output columns directly in j.
DT2[
  DT1,
  .(x, y, z, t),
  on = .(a = x, y_start <= y, y_end >= y)
]
# x y z t
#1: 1 15 -0.6264538 NA
#2: 2 25 0.1836433 1206
#3: 3 35 -0.8356286 NA
#4: 4 45 1.5952808 NA
#5: 5 55 0.3295078 1686
问候!
答案 1 :(得分:0)
我们可以反过来进行连接:
# Join with DT1 as i, then rename the join columns a / y_start to x / y in a
# second, chained step.
DT2[DT1, on = .(a = x, y_start <= y, y_end >= y)][
  , .(x = a, y = y_start, z, t)
]
# x y z t
#1: 1 15 -0.6264538 NA
#2: 2 25 0.1836433 1206
#3: 3 35 -0.8356286 NA
#4: 4 45 1.5952808 NA
#5: 5 55 0.3295078 1686