我想做一个 data.table 表联接:其中一部分键要求精确匹配,另一个键则按最接近的值匹配(roll = "nearest")。下面的示例使用了笛卡尔联接,无法用于较大的表。我能否按 name、site、time 进行联接,匹配方式分别为(精确、精确、"最近")?
#toy example
library(data.table)
library(lubridate)
# Fix the RNG seed so the two toy tables below are reproducible.
set.seed(1)

# df1: 10 rows with a randomly drawn name and site, plus a timestamp `time1`
# that lies a random whole number of minutes (1..1e5) after 2019-01-01 00:00.
# The sample() calls are kept in their original order so the random stream,
# and therefore the generated data, is unchanged.
df1 <- data.table(
  name = sample(c("Andy", "Beth", "Chris"), 10, replace = TRUE),
  site = sample(LETTERS[1:3], 10, replace = TRUE),
  time1 = ymd_hm("2019-01-01 00:00") + dminutes(sample(seq(1, 1e5), 10))
)

# df2: built the same way as df1, but its timestamp column is named `time2`.
df2 <- data.table(
  name = sample(c("Andy", "Beth", "Chris"), 10, replace = TRUE),
  site = sample(LETTERS[1:3], 10, replace = TRUE),
  time2 = ymd_hm("2019-01-01 00:00") + dminutes(sample(seq(1, 1e5), 10))
)
> df1
name site time1
1: Andy A 2019-03-06 21:51:00
2: Beth A 2019-01-15 17:35:00
3: Beth C 2019-02-15 06:07:00
4: Chris B 2019-01-09 17:16:00
5: Andy C 2019-01-19 13:21:00
6: Chris B 2019-01-27 19:30:00
7: Chris C 2019-01-01 22:19:00
8: Beth C 2019-01-27 13:17:00
9: Beth B 2019-03-02 09:23:00
10: Andy C 2019-01-24 15:12:00
> df2
name site time2
1: Beth C 2019-02-03 04:02:00
2: Beth B 2019-03-01 19:21:00
3: Beth C 2019-01-31 10:09:00
4: Andy B 2019-01-17 23:59:00
5: Chris B 2019-01-05 21:48:00
6: Chris C 2019-01-07 21:47:00
7: Chris A 2019-01-22 23:06:00
8: Andy B 2019-02-06 00:20:00
9: Chris C 2019-02-15 23:16:00
10: Beth C 2019-01-29 06:00:00
# Match df1 to df2 exactly on (name, site) and keep, for every (name, site)
# group, the single time1/time2 pair whose timestamps are closest together.
# Steps of the chain:
#   1. inner join on name and site (nomatch = NULL drops unmatched df2 rows),
#      which yields every time1/time2 combination within a group;
#   2. add a helper column `diff` holding the absolute time difference;
#   3. sort by name, site and diff, then keep the first row of each group;
#   4. drop the helper column again.
df3 <- df1[df2, on = .(name, site), nomatch = NULL][
  , diff := abs(time1 - time2)
][
  order(name, site, diff), .SD[1L], by = .(name, site)
][
  , diff := NULL
]
> df3
name site time1 time2
1: Beth B 2019-03-02 09:23:00 2019-03-01 19:21:00
2: Beth C 2019-01-27 13:17:00 2019-01-29 06:00:00
3: Chris B 2019-01-09 17:16:00 2019-01-05 21:48:00
4: Chris C 2019-01-01 22:19:00 2019-01-07 21:47:00