R data.table 的连接(join)条件

时间:2015-02-26 16:46:44

标签: r data.table

我有两个表,希望用类似下面 SQL 的方式把它们连接起来:连接时使用多个条件,而不仅仅是等值条件。

# Load the packages. library() stops with an error when a package is missing,
# whereas require() only returns FALSE and lets the script fail later on.
library(sqldf)
library(data.table)

# First example table (aliased as `dtone` in the SQL query below).
dt <- data.table(num=c(1, 2, 3, 4, 5, 6), 
char=c('A', 'A', 'A', 'B', 'B', 'B'), 
bool=c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE))

# Second example table (aliased as `dttwo` in the SQL query below).
dt_two <- data.table(
num =c(6, 1, 5, 2, 4, 3), 
char=c('A', 'A', 'A', 'B', 'B', 'B'), 
bool=c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE))


# Reference result: inner join on equal `char` plus the extra condition
# (dtone.num >= dttwo.num OR dtone.bool), then SUM and MIN of the matching
# dttwo.num values for every dtone row.
dt_out_sql <- sqldf('
    select dtone.num, dtone.char, dtone.bool, SUM(dttwo.num) as SUM,  
   MIN(dttwo.num) as MIN
    from dt as dtone INNER join dt_two as dttwo on 
    (dtone.char = dttwo.char) and 
    (dtone.num >= dttwo.num OR dtone.bool)
GROUP BY dtone.num, dtone.char, dtone.bool')

出于性能和灵活性的考虑,我想避免使用 SQL 方案。先做交叉连接再过滤/聚合的做法同样不可取——它会创建一个包含大量不必要记录的中间表,之后还得再把这些记录过滤掉。

非常感谢!

更新 - 我最初的例子写得比较仓促。在我的实际问题中,并不是自连接。

3 个答案:

答案 0 :(得分:9)

以这种方式:

# library() fails immediately if data.table is not installed; require()
# would only return FALSE.
library(data.table)
setkey(dt, char)
setkey(dt_two, char)

# For every row of dt (the i table), look up the dt_two rows that share its
# `char` key and aggregate the dt_two `num` values that satisfy the rest of
# the join condition. Inside j, `num` is dt_two's column and the `i.` prefix
# refers to the current dt row.
# nomatch = 0L drops dt rows whose `char` has no partner in dt_two, matching
# the SQL INNER JOIN.
dt_two[dt, {
   val <- num[i.bool | i.num >= num]
   # Return nothing for dt rows with no qualifying dt_two value: an inner
   # join yields no row for them, and min() on empty input would otherwise
   # return Inf with a warning.
   if (length(val) > 0L) {
      list(num = i.num, bool = i.bool, sum = sum(val), min = min(val))
   }
}, by = .EACHI, nomatch = 0L]
#    char num  bool sum min
# 1:    A   1  TRUE  12   1
# 2:    A   2 FALSE   1   1
# 3:    A   3  TRUE  12   1
# 4:    B   4 FALSE   9   2
# 5:    B   5  TRUE   9   2
# 6:    B   6 FALSE   9   2

要了解 by=.EACHI,请参阅 this post(在关于连接的 vignette 完成之前,可先参考它)。

HTH

答案 1 :(得分:5)

它有点丑陋但有效:

library(data.table)
library(sqldf)

# Two small example tables that share the columns num, char and bool.
dt <- data.table(
  num  = as.numeric(1:6),
  char = rep(c("A", "B"), each = 3),
  bool = rep(c(TRUE, FALSE), times = 3)
)

dt_two <- data.table(
  num  = c(6, 1, 5, 2, 4, 3),
  char = rep(c("A", "B"), each = 3),
  bool = rep(c(TRUE, FALSE), times = 3)
)


# Reference result computed through SQL: inner join on equal `char` plus the
# condition (dtone.num >= dttwo.num OR dtone.bool), aggregated per dtone row.
dt_out_sql <- sqldf(x = "
    select dtone.num,
            dtone.char,
            dtone.bool,
            SUM(dttwo.num) as SUM,  
            MIN(dttwo.num) as MIN
    from    dt as dtone
    INNER join dt_two as dttwo on 
          (dtone.char = dttwo.char) and 
          (dtone.num >= dttwo.num OR dtone.bool)
    GROUP BY dtone.num, dtone.char, dtone.bool
  ")

# sqldf() returns a data.frame; convert it to a data.table by reference so it
# can be compared with the data.table result at the end.
setDT(dt_out_sql)

setkey(dt, char)
setkey(dt_two, char)

# Step 1: equi-join on the `char` key, keeping both `num` columns side by
#         side. Every dt_two row matches several dt rows, so the result has
#         more rows than either input and allow.cartesian = TRUE is needed
#         (spelled out as TRUE because `T` is an ordinary, reassignable
#         variable).
# Step 2: apply the remaining (non-equi) part of the join condition as a
#         filter, then aggregate per original dt row.
dt_out_r <- dt[dt_two,
               list(dtone.num = num,
                    dttwo.num = i.num,
                    char,
                    bool) ,
               nomatch = 0, allow.cartesian = TRUE
               ][
                 dtone.num >= dttwo.num | bool,
                 list(SUM = sum(dttwo.num),
                      MIN = min(dttwo.num)),
                 by = list(num = dtone.num,
                           char,
                           bool)
                 ]

# Sort by the grouping columns before comparing with the SQL result.
setkey(dt_out_r, num, char, bool)


# Check that the data.table result agrees with the SQL reference result.
all.equal(dt_out_sql, dt_out_r, check.attributes = FALSE)

答案 2 :(得分:-2)

从 data.table 1.9.8 开始,对于可以放宽连接条件的情况,可以使用简单的非等值连接(non-equi join)语法:

# In on = .(...), the left side of each operator names a column of x (dt_two)
# and the right side a column of i (dt). The SQL condition
# dtone.num >= dttwo.num therefore has to be written as dt_two's num <= dt's
# num.
# NOTE: the `OR dtone.bool` part of the SQL condition cannot be expressed in
# on=, so this only covers the relaxed join condition.
dt_two[dt, on = .(char, num <= num)]