Indices of all rows falling within a given time window, for many such buckets. Faster than many calls to findInterval?

Time: 2017-12-22 00:23:47

Tags: r dplyr data.table lubridate

Consider this data frame of entry and exit times:

entry_exit_times = structure(list(time_in = structure(c(1325552760, 1325555940, 
1325565540, 1325576820, 1325568060), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), time_out = structure(c(1325581320, 1325602980, 
1325621880, 1325582040, 1325577660), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), .Names = c("time_in", "time_out"), row.names = c(NA, 
-5L), class = "data.frame")

Note that these are sorted by the values of the time_in column.

action_times = structure(list(index = 1:50, time_action = structure(c(1325552100, 
1325553360, 1325553720, 1325553900, 1325560260, 1325560860, 1325562780, 
1325563380, 1325564520, 1325569260, 1325569800, 1325570760, 1325576220, 
1325579700, 1325580960, 1325581440, 1325582340, 1325585580, 1325585700, 
1325586360, 1325586660, 1325588520, 1325588880, 1325591580, 1325595300, 
1325595900, 1325598960, 1325602380, 1325602560, 1325602740, 1325605020, 
1325605740, 1325608560, 1325609100, 1325610120, 1325611740, 1325613660, 
1325613840, 1325616240, 1325619600, 1325620620, 1325621640, 1325623380, 
1325624100, 1325626440, 1325627040, 1325627400, 1325627520, 1325628420, 
1325628660), class = c("POSIXct", "POSIXt"), tzone = "UTC")), .Names = c("index", 
"time"), row.names = c(NA, -50L), class = "data.frame")

Note that these are sorted by the values of the time column.

Now, for row i of entry_exit_times, I want to find all the indices of rows of action_times whose time value lies between entry_exit_times$time_in[i] and entry_exit_times$time_out[i].

One approach is:

output = matrix(NA, nrow(entry_exit_times), 2)

for(i in 1:nrow(output)){
  output[i, ] = findInterval(c(entry_exit_times$time_in[i], entry_exit_times$time_out[i]),
                             action_times$time, all.inside = TRUE)
}

But my question is: is there a way to make this more efficient? nrow(entry_exit_times) is very large.

I can also use dplyr, lubridate, and data.table.

Edit:

Here is the data-generating process for a larger sample:

a_1 = seq(from=as.POSIXct("2012-1-3 0:00", tz="UTC"),
    to=as.POSIXct("2012-1-31 23:00", tz="UTC"),
    by="sec")  
n   = length(a_1) / 10
m   = length(a_1) / 100
a_2 = sort(sample(a_1, n))
a_3 = t(apply(matrix(sample(1:length(a_2), n), nc = 2), 1, sort))
entry_exit_times = data.frame(time_in = rep(NA, n), time_out = rep(NA, n))
entry_exit_times$time_in = a_2[a_3[, 1]]
entry_exit_times$time_out = a_2[a_3[, 2]]
time_action = data.frame(index = 1:m, time = sort(sample(a_1, m)))

Edit: some benchmarks of the proposed solutions.

Note: running these takes some RAM. Specifically, while running the data.table solution I saw RAM usage exceed 32GB (which was a bit surprising, since dplyr never had anywhere near that large a RAM footprint).

library(dplyr)
library(lubridate)
library(data.table)


a_1 = seq(from=as.POSIXct("2012-01-01 0:00", tz="UTC"),
          to=as.POSIXct("2012-01-30 0:00", tz="UTC"),
          by="sec")  
n   = floor(length(a_1) / 100)
m   = floor(length(a_1) / 100)
a_2 = sort(sample(a_1, n))
a_3 = t(apply(matrix(sample(1:length(a_2), n), nc = 2), 1, sort))
entry_exit_times = data.frame(time_in = rep(NA, n), time_out = rep(NA, n))
entry_exit_times$time_in = a_2[a_3[, 1]]
entry_exit_times$time_out = a_2[a_3[, 2]]
time_action = data.frame(index = 1:m, time = sort(sample(a_1, m)))

fx01 <- function(entry_exit_times, action_times){
  above_in <- outer(entry_exit_times$time_in, 
                    action_times$time, 
                    function(e, a) e <= a)
  below_out <- outer(entry_exit_times$time_out, 
                     action_times$time, 
                     function(e, a) a <= e)
  apply(above_in & below_out, 1, which)
}

fx02 <- function(entry_exit_times, action_times){
  entry_exit_times <- entry_exit_times %>%
    mutate(
      entry_interval = interval(time_in, time_out)
    )

  action_times %>%
    filter(any(time %within% entry_exit_times$entry_interval)) %>%
    as_tibble
}

fx03 <- function(entry_exit_times, action_times){
  setDT(entry_exit_times)
  setDT(action_times)
  na.omit(action_times[entry_exit_times,
               on = .(time >= time_in, time <= time_out), allow.cartesian = TRUE])
}

system.time(fx01(entry_exit_times, time_action))

#   user  system elapsed 
# 17.160   4.068  21.226 

system.time(fx02(entry_exit_times, time_action))

#   user  system elapsed 
#  0.128   0.000   0.134 

system.time(fx03(entry_exit_times, time_action))

#   user  system elapsed 
#  3.716   0.576   3.411 

3 Answers:

Answer 0 (score: 4)

This is a non-equi join problem:

library(data.table)

setDT(entry_exit_times)
setDT(action_times)

action_times[entry_exit_times,
             on = .(time >= time_in, time <= time_out), allow.cartesian = TRUE]
#    index                time              time.1
# 1:     2 2012-01-03 01:06:00 2012-01-03 09:02:00
# 2:     3 2012-01-03 01:06:00 2012-01-03 09:02:00
# 3:     4 2012-01-03 01:06:00 2012-01-03 09:02:00
# 4:     5 2012-01-03 01:06:00 2012-01-03 09:02:00
# 5:     6 2012-01-03 01:06:00 2012-01-03 09:02:00
# 6:     7 2012-01-03 01:06:00 2012-01-03 09:02:00
# 7:     8 2012-01-03 01:06:00 2012-01-03 09:02:00
# 8:     9 2012-01-03 01:06:00 2012-01-03 09:02:00
# 9:    10 2012-01-03 01:06:00 2012-01-03 09:02:00
# 10:    11 2012-01-03 01:06:00 2012-01-03 09:02:00
# 11:    12 2012-01-03 01:06:00 2012-01-03 09:02:00
# 12:    13 2012-01-03 01:06:00 2012-01-03 09:02:00
# 13:    14 2012-01-03 01:06:00 2012-01-03 09:02:00
# 14:    15 2012-01-03 01:06:00 2012-01-03 09:02:00
# 15:     5 2012-01-03 01:59:00 2012-01-03 15:03:00
# 16:     6 2012-01-03 01:59:00 2012-01-03 15:03:00
# 17:     7 2012-01-03 01:59:00 2012-01-03 15:03:00
# 18:     8 2012-01-03 01:59:00 2012-01-03 15:03:00
# 19:     9 2012-01-03 01:59:00 2012-01-03 15:03:00
# 20:    10 2012-01-03 01:59:00 2012-01-03 15:03:00
# 21:    11 2012-01-03 01:59:00 2012-01-03 15:03:00
# 22:    12 2012-01-03 01:59:00 2012-01-03 15:03:00
# 23:    13 2012-01-03 01:59:00 2012-01-03 15:03:00
# 24:    14 2012-01-03 01:59:00 2012-01-03 15:03:00
# 25:    15 2012-01-03 01:59:00 2012-01-03 15:03:00
# 26:    16 2012-01-03 01:59:00 2012-01-03 15:03:00
# 27:    17 2012-01-03 01:59:00 2012-01-03 15:03:00
# 28:    18 2012-01-03 01:59:00 2012-01-03 15:03:00
# 29:    19 2012-01-03 01:59:00 2012-01-03 15:03:00
# 30:    20 2012-01-03 01:59:00 2012-01-03 15:03:00
# 31:    21 2012-01-03 01:59:00 2012-01-03 15:03:00
# 32:    22 2012-01-03 01:59:00 2012-01-03 15:03:00
# 33:    23 2012-01-03 01:59:00 2012-01-03 15:03:00
# 34:    24 2012-01-03 01:59:00 2012-01-03 15:03:00
# 35:    25 2012-01-03 01:59:00 2012-01-03 15:03:00
# 36:    26 2012-01-03 01:59:00 2012-01-03 15:03:00
# 37:    27 2012-01-03 01:59:00 2012-01-03 15:03:00
# 38:    28 2012-01-03 01:59:00 2012-01-03 15:03:00
# 39:    29 2012-01-03 01:59:00 2012-01-03 15:03:00
# 40:    30 2012-01-03 01:59:00 2012-01-03 15:03:00
# 41:    10 2012-01-03 04:39:00 2012-01-03 20:18:00
# 42:    11 2012-01-03 04:39:00 2012-01-03 20:18:00
# 43:    12 2012-01-03 04:39:00 2012-01-03 20:18:00
# 44:    13 2012-01-03 04:39:00 2012-01-03 20:18:00
# 45:    14 2012-01-03 04:39:00 2012-01-03 20:18:00
# 46:    15 2012-01-03 04:39:00 2012-01-03 20:18:00
# 47:    16 2012-01-03 04:39:00 2012-01-03 20:18:00
# 48:    17 2012-01-03 04:39:00 2012-01-03 20:18:00
# 49:    18 2012-01-03 04:39:00 2012-01-03 20:18:00
# 50:    19 2012-01-03 04:39:00 2012-01-03 20:18:00
# 51:    20 2012-01-03 04:39:00 2012-01-03 20:18:00
# 52:    21 2012-01-03 04:39:00 2012-01-03 20:18:00
# 53:    22 2012-01-03 04:39:00 2012-01-03 20:18:00
# 54:    23 2012-01-03 04:39:00 2012-01-03 20:18:00
# 55:    24 2012-01-03 04:39:00 2012-01-03 20:18:00
# 56:    25 2012-01-03 04:39:00 2012-01-03 20:18:00
# 57:    26 2012-01-03 04:39:00 2012-01-03 20:18:00
# 58:    27 2012-01-03 04:39:00 2012-01-03 20:18:00
# 59:    28 2012-01-03 04:39:00 2012-01-03 20:18:00
# 60:    29 2012-01-03 04:39:00 2012-01-03 20:18:00
# 61:    30 2012-01-03 04:39:00 2012-01-03 20:18:00
# 62:    31 2012-01-03 04:39:00 2012-01-03 20:18:00
# 63:    32 2012-01-03 04:39:00 2012-01-03 20:18:00
# 64:    33 2012-01-03 04:39:00 2012-01-03 20:18:00
# 65:    34 2012-01-03 04:39:00 2012-01-03 20:18:00
# 66:    35 2012-01-03 04:39:00 2012-01-03 20:18:00
# 67:    36 2012-01-03 04:39:00 2012-01-03 20:18:00
# 68:    37 2012-01-03 04:39:00 2012-01-03 20:18:00
# 69:    38 2012-01-03 04:39:00 2012-01-03 20:18:00
# 70:    39 2012-01-03 04:39:00 2012-01-03 20:18:00
# 71:    40 2012-01-03 04:39:00 2012-01-03 20:18:00
# 72:    41 2012-01-03 04:39:00 2012-01-03 20:18:00
# 73:    42 2012-01-03 04:39:00 2012-01-03 20:18:00
# 74:    14 2012-01-03 07:47:00 2012-01-03 09:14:00
# 75:    15 2012-01-03 07:47:00 2012-01-03 09:14:00
# 76:    16 2012-01-03 07:47:00 2012-01-03 09:14:00
# 77:    10 2012-01-03 05:21:00 2012-01-03 08:01:00
# 78:    11 2012-01-03 05:21:00 2012-01-03 08:01:00
# 79:    12 2012-01-03 05:21:00 2012-01-03 08:01:00
# 80:    13 2012-01-03 05:21:00 2012-01-03 08:01:00
# index                time              time.1

Here, time and time.1 hold the time_in and time_out values, respectively.
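
If the original action time and the interval bounds are wanted under their own names, the x. and i. prefixes can be referenced in j to pull them out explicitly. A sketch (the output column names action_time, time_in and time_out are just illustrative):

action_times[entry_exit_times,
             .(index, action_time = x.time, time_in = i.time_in, time_out = i.time_out),
             on = .(time >= time_in, time <= time_out),
             allow.cartesian = TRUE]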

Here is a benchmark with the larger dataset:

system.time({
    setDT(entry_exit_times)
    setDT(time_action)

    time_action[entry_exit_times,
                 on = .(time >= time_in, time <= time_out), allow.cartesian = TRUE]
})

#    user  system elapsed 
#  26.893  15.950  42.893

The speed is acceptable, but it requires a lot of memory (up to 60GB).
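
If only the per-interval indices are needed, one way that may reduce the memory footprint (a sketch, not benchmarked here) is to aggregate the matching action indices per row of entry_exit_times with by = .EACHI, instead of materializing the full cartesian join:

# one row per interval, with a list column of matching action indices
time_action[entry_exit_times,
            .(indices = list(index)),
            on = .(time >= time_in, time <= time_out),
            by = .EACHI]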

Answer 1 (score: 1)

I don't know whether this will be feasible; it depends on what "very large" means. But here is an approach without a loop: find which actions are at or after time_in and at or before time_out.

above_in <- outer(entry_exit_times$time_in, 
                  action_times$time, 
                  function(e, a) e <= a)
below_out <- outer(entry_exit_times$time_out, 
                  action_times$time, 
                  function(e, a) a <= e)
apply(above_in & below_out, 1, which)

With the data you provided, this gives:

[[1]]
 [1]  2  3  4  5  6  7  8  9 10 11 12 13 14 15

[[2]]
 [1]  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30

[[3]]
 [1] 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42

[[4]]
[1] 14 15 16

[[5]]
[1] 10 11 12 13
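
Since action_times$time is sorted, the matching indices for each interval form a contiguous range, so a memory-lighter sketch of the same idea (assuming R >= 3.3.0 for the left.open argument) replaces the two n-by-m outer matrices with two vectorized findInterval calls:

# first index with time >= time_in, last index with time <= time_out
first_idx <- findInterval(entry_exit_times$time_in, action_times$time, left.open = TRUE) + 1
last_idx  <- findInterval(entry_exit_times$time_out, action_times$time)
Map(function(lo, hi) if (lo <= hi) lo:hi else integer(0), first_idx, last_idx)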

Answer 2 (score: 1)

I would use lubridate::interval to combine the start and end times, and search with the %within% function from the same package. As with most rowwise operations, performance takes a hit; in this case I use sapply to construct a logical vector to pass to filter. mt1022's data.table solution will be the better performer, but given the scale of the sample data, this approach is still reasonable.

library(dplyr)
library(lubridate)

entry_exit_times <- entry_exit_times %>%
  mutate(
    entry_interval = interval(time_in, time_out)
  )

time_action %>%
  as_tibble %>%
  mutate(
    keep = sapply(time, function(x) {

      any(x %within% entry_exit_times$entry_interval)

    })
  ) %>%
  filter(keep) %>%
  select(-keep)

# # A tibble: 25,019 x 2
#    index                time
#    <dbl>              <dttm>
#  1     2 2012-01-03 00:01:28
#  2     3 2012-01-03 00:04:15
#  3     4 2012-01-03 00:07:05
#  4     5 2012-01-03 00:08:18
#  5     6 2012-01-03 00:08:37
#  6     7 2012-01-03 00:10:56
#  7     8 2012-01-03 00:11:02
#  8     9 2012-01-03 00:12:28
#  9    10 2012-01-03 00:13:22
# 10    11 2012-01-03 00:15:54
# # ... with 25,009 more rows
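
A related sketch, under the assumption that entry_exit_times has far fewer rows than time_action: loop over the intervals rather than over the individual times, so that each comparison is vectorized over the long time column (keep here is a scratch logical vector, not part of the code above):

# accumulate a logical "in any interval" flag, one interval at a time
keep <- rep(FALSE, nrow(time_action))
for (i in seq_len(nrow(entry_exit_times))) {
  keep <- keep | (time_action$time >= entry_exit_times$time_in[i] &
                  time_action$time <= entry_exit_times$time_out[i])
}
time_action[keep, ]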