条件(不等式)连接data.table

时间:2014-03-27 07:13:04

标签: r data.table

我只想弄清楚如何在两个data.tables上进行条件连接。

dt2

dt2

我写了一个sqldf条件连接来给我开始或结束时间在另一个开始/结束时间内的电路。

sqldf("select dt2.start, dt2.finish, dt2.counts, dt1.id, dt1.circuit 
      from dt2 
        left join dt1 on (
          (dt2.start  >= dt1.start and dt2.start  < dt1.finish) or 
          (dt2.finish >= dt1.start and dt2.finish < dt1.finish)
        )")

result

这给了我正确的结果,但对于我的大数据集来说它太慢了。

没有矢量扫描,data.table这样做的方法是什么?

这是我的数据:

dt1 <- data.table(structure(list(circuit = structure(c(2L, 1L, 2L, 1L, 2L, 3L, 
1L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"), start = structure(c(1393621200, 
1393627920, 1393628400, 1393631520, 1393650300, 1393646400, 1393656000, 
1393668000, 1393666200), class = c("POSIXct", "POSIXt"), tzone = ""), 
    end = structure(c(1393626600, 1393631519, 1393639200, 1393632000, 
    1393660500, 1393673400, 1393667999, 1393671600, 1393677000
    ), class = c("POSIXct", "POSIXt"), tzone = ""), id = structure(1:9, .Label = c("1001", 
    "1002", "1003", "1004", "1005", "1006", "1007", "1008", "1009"
    ), class = "factor")), .Names = c("circuit", "start", "end", 
"id"), class = "data.frame", row.names = c(NA, -9L)))


dt2 <- data.table(structure(list(start = structure(c(1393621200, 1393624800, 1393626600, 
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299, 
1393655999, 1393660500, 1393666199, 1393671600, 1393673400), class = c("POSIXct", 
"POSIXt"), tzone = ""), end = structure(c(1393624799, 1393626600, 
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299, 
1393655999, 1393660500, 1393666199, 1393671600, 1393673400, 1393677000
), class = c("POSIXct", "POSIXt"), tzone = ""), seconds = c(3599L, 
1800L, 1319L, 480L, 3601L, 7200L, 7199L, 3900L, 5700L, 4501L, 
5699L, 5401L, 1800L, 3600L), counts = c(1L, 1L, 0L, 1L, 2L, 1L, 
0L, 1L, 2L, 3L, 2L, 3L, 2L, 1L)), .Names = c("start", "end", 
"seconds", "counts"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 11L, 12L, 13L, 14L, 15L), class = "data.frame"))

2 个答案:

答案 0 :(得分:9)

使用非equi 加入:

ans = dt1[dt2, on=.(start <= end, end > start), 
           .(i.start, i.end, counts, id, circuit, cndn = i.start < x.start & i.end >= x.end), 
           allow.cartesian=TRUE
        ][!cndn %in% TRUE]

条件start <= end, end >= start(注意两个情况下的>=)将检查两个间隔是否以任何方式重叠。一方的开放区间由end > start部分(>而不是>=)完成。但它仍然会选择类型的间隔:

         dt1: start=================end
   dt2: start--------------------------------end ## start < start, end > end

         dt1: start=================end
                dt2: start----------end          ## end == end

cndn列是检查并删除这些案例。希望这些案例不是很多,所以我们不会不必要地实现不需要的行。

PS:这种情况下的解决方案并不像我想的那样简单,这是因为解决方案需要OR操作。可以执行两个条件连接,然后将它们绑定在一起。

也许在某些时候,我们必须考虑以更直接的方式将连接扩展到这些类型的操作的可行性。

答案 1 :(得分:4)

不知道这是否表现得更快,但这是一个数据表方法的镜头。我重塑dt1并使用findInterval来确定dt2中的时间与dt1中的时间对齐。

dt1 <- data.table(structure(list(circuit = structure(c(2L, 1L, 2L, 1L, 2L, 3L, 
1L, 1L, 2L), .Label = c("a", "b", "c"), class = "factor"), start = structure(c(1393621200, 
1393627920, 1393628400, 1393631520, 1393650300, 1393646400, 1393656000, 
1393668000, 1393666200), class = c("POSIXct", "POSIXt"), tzone = ""), 
    end = structure(c(1393626600, 1393631519, 1393639200, 1393632000, 
    1393660500, 1393673400, 1393667999, 1393671600, 1393677000
    ), class = c("POSIXct", "POSIXt"), tzone = ""), id = structure(1:9, .Label = c("1001", 
    "1002", "1003", "1004", "1005", "1006", "1007", "1008", "1009"
    ), class = "factor")), .Names = c("circuit", "start", "end", 
"id"), class = "data.frame", row.names = c(NA, -9L)))

dt2 <- data.table(structure(list(start = structure(c(1393621200, 1393624800, 1393626600, 
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299, 
1393655999, 1393660500, 1393666199, 1393671600, 1393673400), class = c("POSIXct", 
"POSIXt"), tzone = ""), end = structure(c(1393624799, 1393626600, 
1393627919, 1393628399, 1393632000, 1393639200, 1393646399, 1393650299, 
1393655999, 1393660500, 1393666199, 1393671600, 1393673400, 1393677000
), class = c("POSIXct", "POSIXt"), tzone = ""), seconds = c(3599L, 
1800L, 1319L, 480L, 3601L, 7200L, 7199L, 3900L, 5700L, 4501L, 
5699L, 5401L, 1800L, 3600L), counts = c(1L, 1L, 0L, 1L, 2L, 1L, 
0L, 1L, 2L, 3L, 2L, 3L, 2L, 1L)), .Names = c("start", "end", 
"seconds", "counts"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 11L, 12L, 13L, 14L, 15L), class = "data.frame"))

# > dt1
   # circuit               start                 end   id
# 1:       b 2014-02-28 16:00:00 2014-02-28 17:30:00 1001
# 2:       a 2014-02-28 17:52:00 2014-02-28 18:51:59 1002
# 3:       b 2014-02-28 18:00:00 2014-02-28 21:00:00 1003
# 4:       a 2014-02-28 18:52:00 2014-02-28 19:00:00 1004
# 5:       b 2014-03-01 00:05:00 2014-03-01 02:55:00 1005
# 6:       c 2014-02-28 23:00:00 2014-03-01 06:30:00 1006
# 7:       a 2014-03-01 01:40:00 2014-03-01 04:59:59 1007
# 8:       a 2014-03-01 05:00:00 2014-03-01 06:00:00 1008
# 9:       b 2014-03-01 04:30:00 2014-03-01 07:30:00 1009

# > dt2
                  # start                 end seconds counts
 # 1: 2014-02-28 16:00:00 2014-02-28 16:59:59    3599      1
 # 2: 2014-02-28 17:00:00 2014-02-28 17:30:00    1800      1
 # 3: 2014-02-28 17:30:00 2014-02-28 17:51:59    1319      0
 # 4: 2014-02-28 17:51:59 2014-02-28 17:59:59     480      1
 # 5: 2014-02-28 17:59:59 2014-02-28 19:00:00    3601      2
 # 6: 2014-02-28 19:00:00 2014-02-28 21:00:00    7200      1
 # 7: 2014-02-28 21:00:00 2014-02-28 22:59:59    7199      0
 # 8: 2014-02-28 22:59:59 2014-03-01 00:04:59    3900      1
 # 9: 2014-03-01 00:04:59 2014-03-01 01:39:59    5700      2
# 10: 2014-03-01 01:39:59 2014-03-01 02:55:00    4501      3
# 11: 2014-03-01 02:55:00 2014-03-01 04:29:59    5699      2
# 12: 2014-03-01 04:29:59 2014-03-01 06:00:00    5401      3
# 13: 2014-03-01 06:00:00 2014-03-01 06:30:00    1800      2
# 14: 2014-03-01 06:30:00 2014-03-01 07:30:00    3600      1

## reshapes dt1 from wide to long
## puts start and end times into one column and sorts by time
## this is so that you can use findInterval later
dt3 <- dt1[,list(time = c(start,end)), by = "circuit,id"][order(time)]
dt3[,ntvl := seq_len(nrow(dt3))]
    # circuit   id                time ntvl
 # 1:       b 1001 2014-02-28 16:00:00    1
 # 2:       b 1001 2014-02-28 17:30:00    2
 # 3:       a 1002 2014-02-28 17:52:00    3
 # 4:       b 1003 2014-02-28 18:00:00    4
 # 5:       a 1002 2014-02-28 18:51:59    5
 # 6:       a 1004 2014-02-28 18:52:00    6
 # 7:       a 1004 2014-02-28 19:00:00    7
 # 8:       b 1003 2014-02-28 21:00:00    8
 # 9:       c 1006 2014-02-28 23:00:00    9
# 10:       b 1005 2014-03-01 00:05:00   10
# 11:       a 1007 2014-03-01 01:40:00   11
# 12:       b 1005 2014-03-01 02:55:00   12
# 13:       b 1009 2014-03-01 04:30:00   13
# 14:       a 1007 2014-03-01 04:59:59   14
# 15:       a 1008 2014-03-01 05:00:00   15
# 16:       a 1008 2014-03-01 06:00:00   16
# 17:       c 1006 2014-03-01 06:30:00   17
# 18:       b 1009 2014-03-01 07:30:00   18

## map interval to id
dt4 <- dt3[,list(ntvl = seq(from = min(ntvl), to = max(ntvl)-1), by = 1),by = "circuit,id"]
setkey(dt4, ntvl)
    # circuit   id ntvl
 # 1:       b 1001    1
 # 2:       a 1002    3
 # 3:       a 1002    4
 # 4:       b 1003    4
 # 5:       b 1003    5
 # 6:       b 1003    6
 # 7:       a 1004    6
 # 8:       b 1003    7
 # 9:       c 1006    9
# 10:       c 1006   10
# 11:       b 1005   10
# 12:       c 1006   11
# 13:       b 1005   11
# 14:       a 1007   11
# 15:       c 1006   12
# 16:       a 1007   12
# 17:       c 1006   13
# 18:       a 1007   13
# 19:       b 1009   13
# 20:       c 1006   14
# 21:       b 1009   14
# 22:       c 1006   15
# 23:       b 1009   15
# 24:       a 1008   15
# 25:       c 1006   16
# 26:       b 1009   16
# 27:       b 1009   17
    # circuit   id ntvl

## finds intervals in dt2
dt2[,`:=`(ntvl_start = findInterval(start, dt3[["time"]], rightmost.closed = FALSE),
    ntvl_end = findInterval(end, dt3[["time"]], rightmost.closed = FALSE))]
                  # start                 end seconds counts ntvl_start ntvl_end
 # 1: 2014-02-28 16:00:00 2014-02-28 16:59:59    3599      1          1        1
 # 2: 2014-02-28 17:00:00 2014-02-28 17:30:00    1800      1          1        2
 # 3: 2014-02-28 17:30:00 2014-02-28 17:51:59    1319      0          2        2
 # 4: 2014-02-28 17:51:59 2014-02-28 17:59:59     480      1          2        3
 # 5: 2014-02-28 17:59:59 2014-02-28 19:00:00    3601      2          3        7
 # 6: 2014-02-28 19:00:00 2014-02-28 21:00:00    7200      1          7        8
 # 7: 2014-02-28 21:00:00 2014-02-28 22:59:59    7199      0          8        8
 # 8: 2014-02-28 22:59:59 2014-03-01 00:04:59    3900      1          8        9
 # 9: 2014-03-01 00:04:59 2014-03-01 01:39:59    5700      2          9       10
# 10: 2014-03-01 01:39:59 2014-03-01 02:55:00    4501      3         10       12
# 11: 2014-03-01 02:55:00 2014-03-01 04:29:59    5699      2         12       12
# 12: 2014-03-01 04:29:59 2014-03-01 06:00:00    5401      3         12       16
# 13: 2014-03-01 06:00:00 2014-03-01 06:30:00    1800      2         16       17
# 14: 2014-03-01 06:30:00 2014-03-01 07:30:00    3600      1         17       18

## joins, by start time, then by end time
## the commented out lines may be a better alternative
## if there are many NA values
setkey(dt2, ntvl_start)
dt_ans_start <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = NA]
# dt_ans_start <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = 0]
# dt_ans_start_na <- dt2[!dt4]
setkey(dt2, ntvl_end)
dt_ans_end <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = NA]
# dt_ans_end <- dt4[dt2, list(start,end,counts,id,circuit),nomatch = 0]
# dt_ans_end_na <- dt2[!dt4]

## bring them all together and remove duplicates
dt_ans <- unique(rbind(dt_ans_start, dt_ans_end), by = c("start", "id"))
dt_ans <- dt_ans[!(is.na(id) & counts > 0)]
dt_ans[,ntvl := NULL]
setkey(dt_ans,start)
                  # start                 end counts   id circuit
 # 1: 2014-02-28 16:00:00 2014-02-28 16:59:59      1 1001       b
 # 2: 2014-02-28 17:00:00 2014-02-28 17:30:00      1 1001       b
 # 3: 2014-02-28 17:30:00 2014-02-28 17:51:59      0   NA      NA
 # 4: 2014-02-28 17:51:59 2014-02-28 17:59:59      1 1002       a
 # 5: 2014-02-28 17:59:59 2014-02-28 19:00:00      2 1002       a
 # 6: 2014-02-28 17:59:59 2014-02-28 19:00:00      2 1003       b
 # 7: 2014-02-28 19:00:00 2014-02-28 21:00:00      1 1003       b
 # 8: 2014-02-28 21:00:00 2014-02-28 22:59:59      0   NA      NA
 # 9: 2014-02-28 22:59:59 2014-03-01 00:04:59      1 1006       c
# 10: 2014-03-01 00:04:59 2014-03-01 01:39:59      2 1006       c
# 11: 2014-03-01 00:04:59 2014-03-01 01:39:59      2 1005       b
# 12: 2014-03-01 01:39:59 2014-03-01 02:55:00      3 1006       c
# 13: 2014-03-01 01:39:59 2014-03-01 02:55:00      3 1005       b
# 14: 2014-03-01 01:39:59 2014-03-01 02:55:00      3 1007       a
# 15: 2014-03-01 02:55:00 2014-03-01 04:29:59      2 1006       c
# 16: 2014-03-01 02:55:00 2014-03-01 04:29:59      2 1007       a
# 17: 2014-03-01 04:29:59 2014-03-01 06:00:00      3 1006       c
# 18: 2014-03-01 04:29:59 2014-03-01 06:00:00      3 1007       a
# 19: 2014-03-01 04:29:59 2014-03-01 06:00:00      3 1009       b
# 20: 2014-03-01 06:00:00 2014-03-01 06:30:00      2 1006       c
# 21: 2014-03-01 06:00:00 2014-03-01 06:30:00      2 1009       b
# 22: 2014-03-01 06:30:00 2014-03-01 07:30:00      1 1009       b
                  # start                 end counts   id circuit