我有两个表,希望以类似于下面这段 SQL 的方式把它们连接起来:连接时使用多个条件,而不仅仅是相等条件。
# Reproducible setup for the question: two small tables and the reference
# result computed with SQL.
# library() is used instead of require() so that a missing package stops the
# script right here instead of returning FALSE and failing later on.
library(sqldf)
library(data.table)

# Left-hand table of the join (aliased `dtone` in the SQL below).
dt <- data.table(
  num  = c(1, 2, 3, 4, 5, 6),
  char = c('A', 'A', 'A', 'B', 'B', 'B'),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)

# Right-hand table (aliased `dttwo`): same `char` and `bool` columns as `dt`,
# but with the `num` values in a different order.
dt_two <- data.table(
  num  = c(6, 1, 5, 2, 4, 3),
  char = c('A', 'A', 'A', 'B', 'B', 'B'),
  bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
)

# Reference result. Rows are joined when `char` is equal AND
# (dt$num >= dt_two$num OR dt$bool); SUM and MIN of dt_two$num are then
# computed per (num, char, bool) of dt.
# The query text is kept exactly as written, since it is passed to the SQL
# engine verbatim.
dt_out_sql <- sqldf('
select dtone.num, dtone.char, dtone.bool, SUM(dttwo.num) as SUM,
MIN(dttwo.num) as MIN
from dt as dtone INNER join dt_two as dttwo on
(dtone.char = dttwo.char) and
(dtone.num >= dttwo.num OR dtone.bool)
GROUP BY dtone.num, dtone.char, dtone.bool')
出于性能和灵活性的考虑,我想避免使用 SQL 方案。先做交叉连接再过滤/聚合的做法同样不理想 - 它会生成一个包含大量不必要记录的中间表,而这些记录最终还得由我过滤掉。
非常感谢!
更新 - 我最初的例子是匆忙写成的。在我的实际问题中,并不是自连接(self-join)。
答案 0 :(得分:9)
以这种方式:
# data.table equivalent of the SQL query: an equality join on `char`, with the
# remaining condition evaluated per row of `dt`.
# library() (not require()) so that a missing package fails immediately.
library(data.table)

# Key both tables on `char`; dt_two[dt] then joins on that key.
setkey(dt, char)
setkey(dt_two, char)

# by = .EACHI evaluates j once for every row of `dt` (the `i` table).
# Inside j, `num` is the dt_two column restricted to the rows matching the
# current dt row, and the `i.` prefix refers to the columns of that dt row.
dt_two[dt, {
  # Keep all matched dt_two$num when the dt row's `bool` is TRUE, otherwise
  # only the values that do not exceed the dt row's `num`.
  val <- num[i.bool | i.num >= num]
  # NOTE(review): if `val` is empty, sum() gives 0 and min() gives Inf with a
  # warning, so such a dt row is kept here whereas the SQL INNER JOIN would
  # drop it -- confirm which behaviour is wanted.
  list(num = i.num, bool = i.bool, sum = sum(val), min = min(val))
}, by = .EACHI]
# char num bool sum min
# 1: A 1 TRUE 12 1
# 2: A 2 FALSE 1 1
# 3: A 3 TRUE 12 1
# 4: B 4 FALSE 9 2
# 5: B 5 TRUE 9 2
# 6: B 6 FALSE 9 2
关于 by=.EACHI 的说明,请查看 this post(在有关联接的 vignette 完成之前,可先参考该帖)。
HTH
答案 1 :(得分:5)
它有点丑陋但有效:
library(data.table)
library(sqldf)
# Sample data: `dt` is the left-hand table of the join and `dt_two` the
# right-hand one. Both share the same `char` groups and `bool` flags; only the
# order of `num` differs.
dt <- data.table(
  num  = as.numeric(1:6),
  char = rep(c("A", "B"), each = 3),
  bool = rep(c(TRUE, FALSE), times = 3)
)
dt_two <- data.table(
  num  = c(6, 1, 5, 2, 4, 3),
  char = rep(c("A", "B"), each = 3),
  bool = rep(c(TRUE, FALSE), times = 3)
)

# Reference answer computed with SQL: join on equal `char` AND
# (dt$num >= dt_two$num OR dt$bool), then take SUM and MIN of dt_two$num for
# each (num, char, bool) of dt. The query text is left untouched.
dt_out_sql <- sqldf('
select dtone.num,
dtone.char,
dtone.bool,
SUM(dttwo.num) as SUM,
MIN(dttwo.num) as MIN
from dt as dtone
INNER join dt_two as dttwo on
(dtone.char = dttwo.char) and
(dtone.num >= dttwo.num OR dtone.bool)
GROUP BY dtone.num, dtone.char, dtone.bool
')
# Convert the SQL result to a data.table in place so that it can be compared
# with the data.table result below.
setDT(dt_out_sql)

# Key both tables on `char` so that dt[dt_two] joins on it.
setkey(dt, char)
setkey(dt_two, char)

# Step 1: inner join on `char` (nomatch = 0 drops unmatched rows). Unprefixed
#         columns come from `dt`; `i.num` is the `num` column of `dt_two`.
#         allow.cartesian = TRUE permits the many-to-many join result.
#         (TRUE is spelled out: `T` is an ordinary variable that can be
#         reassigned.)
# Step 2: keep only the pairs satisfying the non-equality part of the
#         condition, then aggregate dt_two's `num` per row of `dt`.
dt_out_r <- dt[
  dt_two,
  list(
    dtone.num = num,
    dttwo.num = i.num,
    char,
    bool
  ),
  nomatch = 0, allow.cartesian = TRUE
][
  dtone.num >= dttwo.num | bool,
  list(
    SUM = sum(dttwo.num),
    MIN = min(dttwo.num)
  ),
  by = list(
    num = dtone.num,
    char,
    bool
  )
]

# Sort by the grouping columns, then compare against the SQL reference while
# ignoring attributes.
setkey(dt_out_r, num, char, bool)
all.equal(dt_out_sql, dt_out_r, check.attributes = FALSE)
答案 2 :(得分:-2)
从 data.table 1.9.8 开始,对于可以放宽连接条件的情况,可以使用简单的非等值连接(non-equi join)语法:
# Non-equi join of dt_two (x) with dt (i): equal `char`, and
# dt_two$num <= dt$num. In `on`, the left side of each operator names a column
# of x (dt_two) and the right side a column of i (dt), so `num <= num` is the
# data.table spelling of the SQL condition `dtone.num >= dttwo.num`.
# This is the relaxed condition only: the `OR dtone.bool` branch of the SQL is
# not expressed, and no SUM/MIN aggregation is performed here.
dt_two[dt, on = .(char, num <= num)]