考虑这个进入和退出时间矩阵:
entry_exit_times = structure(list(time_in = structure(c(1325552760, 1325555940,
1325565540, 1325576820, 1325568060), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), time_out = structure(c(1325581320, 1325602980,
1325621880, 1325582040, 1325577660), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), .Names = c("time_in", "time_out"), row.names = c(NA,
-5L), class = "data.frame")
请注意,这些按time_in
colunm。
action_times = structure(list(index = 1:50, time_action = structure(c(1325552100,
1325553360, 1325553720, 1325553900, 1325560260, 1325560860, 1325562780,
1325563380, 1325564520, 1325569260, 1325569800, 1325570760, 1325576220,
1325579700, 1325580960, 1325581440, 1325582340, 1325585580, 1325585700,
1325586360, 1325586660, 1325588520, 1325588880, 1325591580, 1325595300,
1325595900, 1325598960, 1325602380, 1325602560, 1325602740, 1325605020,
1325605740, 1325608560, 1325609100, 1325610120, 1325611740, 1325613660,
1325613840, 1325616240, 1325619600, 1325620620, 1325621640, 1325623380,
1325624100, 1325626440, 1325627040, 1325627400, 1325627520, 1325628420,
1325628660), class = c("POSIXct", "POSIXt"), tzone = "UTC")), .Names = c("index",
"time"), row.names = c(NA, -50L), class = "data.frame")
请注意,这些按time
colunm。
现在,对于i
的行entry_exit_times
,我想找到
action_times
行的所有索引都带有值
time
之间的entry_exit_times$time_in[i]
列
和entry_exit_times$time_out[i]
。
一种方法是:
output = matrix(NA, nrow(entry_exit_times), 2)
for(i in 1:nrow(output)){
output[i, ] = findInterval(c(entry_exit_times$time_in[i], entry_exit_times$time_out[i]), action_times$time, all.inside = TRUE)
}
但我的问题是:有没有办法让这个更有效率?
nrow(entry_exit_times)
非常大。
我还可以使用dplyr
lubridate
和data.table
。
以下是较大样本的数据生成过程:
a_1 = seq(from=as.POSIXct("2012-1-3 0:00", tz="UTC"),
to=as.POSIXct("2012-1-31 23:00", tz="UTC"),
by="sec")
n = length(a_1) / 10
m = length(a_1) / 100
a_2 = sort(sample(a_1, n))
a_3 = t(apply(matrix(sample(1:length(a_2), n), nc = 2), 1, sort))
entry_exit_times = data.frame(time_in = rep(NA, n), time_out = rep(NA, n))
entry_exit_times$time_in = a_2[a_3[, 1]]
entry_exit_times$time_out = a_2[a_3[, 2]]
time_action = data.frame(index = 1:m, time = sort(sample(a_1, m)))
注意:运行这些需要一些ram。具体来说,在运行data.table解决方案时,我注意到RAM的使用率超过了32GB(这有点令人惊讶,因为dplyr没有任何这么大的RAM占用空间)。
library(dplyr)
library(lubridate)
library(data.table)
a_1 = seq(from=as.POSIXct("2012-01-01 0:00", tz="UTC"),
to=as.POSIXct("2012-01-30 0:00", tz="UTC"),
by="sec")
n = floor(length(a_1) / 100)
m = floor(length(a_1) / 100)
a_2 = sort(sample(a_1, n))
a_3 = t(apply(matrix(sample(1:length(a_2), n), nc = 2), 1, sort))
entry_exit_times = data.frame(time_in = rep(NA, n), time_out = rep(NA, n))
entry_exit_times$time_in = a_2[a_3[, 1]]
entry_exit_times$time_out = a_2[a_3[, 2]]
time_action = data.frame(index = 1:m, time = sort(sample(a_1, m)))
fx01 <- function(entry_exit_times, action_times){
above_in <- outer(entry_exit_times$time_in,
action_times$time,
function(e, a) e <= a)
below_out <- outer(entry_exit_times$time_out,
action_times$time,
function(e, a) a <= e)
apply(above_in & below_out, 1, which)
}
fx02 <- function(entry_exit_times, action_times){
entry_exit_times <- entry_exit_times %>%
mutate(
entry_interval = interval(time_in, time_out)
)
time_action %>%
filter(any(time %within% entry_exit_times$entry_interval)) %>%
as_tibble
}
fx03 <- function(entry_exit_times, action_times){
setDT(entry_exit_times)
setDT(action_times)
na.omit(action_times[entry_exit_times,
on = .(time >= time_in, time <= time_out), allow.cartesian = TRUE])
}
system.time(fx01(entry_exit_times, time_action))
# user system elapsed
# 17.160 4.068 21.226
system.time(fx02(entry_exit_times, time_action))
# user system elapsed
# 0.128 0.000 0.134
system.time(fx03(entry_exit_times, time_action))
# user system elapsed
# 3.716 0.576 3.411
答案 0 :(得分:4)
这是一个非等连接问题:
library(data.table)
setDT(entry_exit_times)
setDT(action_times)
action_times[entry_exit_times,
on = .(time >= time_in, time <= time_out), allow.cartesian = TRUE]
# index time time.1
# 1: 2 2012-01-03 01:06:00 2012-01-03 09:02:00
# 2: 3 2012-01-03 01:06:00 2012-01-03 09:02:00
# 3: 4 2012-01-03 01:06:00 2012-01-03 09:02:00
# 4: 5 2012-01-03 01:06:00 2012-01-03 09:02:00
# 5: 6 2012-01-03 01:06:00 2012-01-03 09:02:00
# 6: 7 2012-01-03 01:06:00 2012-01-03 09:02:00
# 7: 8 2012-01-03 01:06:00 2012-01-03 09:02:00
# 8: 9 2012-01-03 01:06:00 2012-01-03 09:02:00
# 9: 10 2012-01-03 01:06:00 2012-01-03 09:02:00
# 10: 11 2012-01-03 01:06:00 2012-01-03 09:02:00
# 11: 12 2012-01-03 01:06:00 2012-01-03 09:02:00
# 12: 13 2012-01-03 01:06:00 2012-01-03 09:02:00
# 13: 14 2012-01-03 01:06:00 2012-01-03 09:02:00
# 14: 15 2012-01-03 01:06:00 2012-01-03 09:02:00
# 15: 5 2012-01-03 01:59:00 2012-01-03 15:03:00
# 16: 6 2012-01-03 01:59:00 2012-01-03 15:03:00
# 17: 7 2012-01-03 01:59:00 2012-01-03 15:03:00
# 18: 8 2012-01-03 01:59:00 2012-01-03 15:03:00
# 19: 9 2012-01-03 01:59:00 2012-01-03 15:03:00
# 20: 10 2012-01-03 01:59:00 2012-01-03 15:03:00
# 21: 11 2012-01-03 01:59:00 2012-01-03 15:03:00
# 22: 12 2012-01-03 01:59:00 2012-01-03 15:03:00
# 23: 13 2012-01-03 01:59:00 2012-01-03 15:03:00
# 24: 14 2012-01-03 01:59:00 2012-01-03 15:03:00
# 25: 15 2012-01-03 01:59:00 2012-01-03 15:03:00
# 26: 16 2012-01-03 01:59:00 2012-01-03 15:03:00
# 27: 17 2012-01-03 01:59:00 2012-01-03 15:03:00
# 28: 18 2012-01-03 01:59:00 2012-01-03 15:03:00
# 29: 19 2012-01-03 01:59:00 2012-01-03 15:03:00
# 30: 20 2012-01-03 01:59:00 2012-01-03 15:03:00
# 31: 21 2012-01-03 01:59:00 2012-01-03 15:03:00
# 32: 22 2012-01-03 01:59:00 2012-01-03 15:03:00
# 33: 23 2012-01-03 01:59:00 2012-01-03 15:03:00
# 34: 24 2012-01-03 01:59:00 2012-01-03 15:03:00
# 35: 25 2012-01-03 01:59:00 2012-01-03 15:03:00
# 36: 26 2012-01-03 01:59:00 2012-01-03 15:03:00
# 37: 27 2012-01-03 01:59:00 2012-01-03 15:03:00
# 38: 28 2012-01-03 01:59:00 2012-01-03 15:03:00
# 39: 29 2012-01-03 01:59:00 2012-01-03 15:03:00
# 40: 30 2012-01-03 01:59:00 2012-01-03 15:03:00
# 41: 10 2012-01-03 04:39:00 2012-01-03 20:18:00
# 42: 11 2012-01-03 04:39:00 2012-01-03 20:18:00
# 43: 12 2012-01-03 04:39:00 2012-01-03 20:18:00
# 44: 13 2012-01-03 04:39:00 2012-01-03 20:18:00
# 45: 14 2012-01-03 04:39:00 2012-01-03 20:18:00
# 46: 15 2012-01-03 04:39:00 2012-01-03 20:18:00
# 47: 16 2012-01-03 04:39:00 2012-01-03 20:18:00
# 48: 17 2012-01-03 04:39:00 2012-01-03 20:18:00
# 49: 18 2012-01-03 04:39:00 2012-01-03 20:18:00
# 50: 19 2012-01-03 04:39:00 2012-01-03 20:18:00
# 51: 20 2012-01-03 04:39:00 2012-01-03 20:18:00
# 52: 21 2012-01-03 04:39:00 2012-01-03 20:18:00
# 53: 22 2012-01-03 04:39:00 2012-01-03 20:18:00
# 54: 23 2012-01-03 04:39:00 2012-01-03 20:18:00
# 55: 24 2012-01-03 04:39:00 2012-01-03 20:18:00
# 56: 25 2012-01-03 04:39:00 2012-01-03 20:18:00
# 57: 26 2012-01-03 04:39:00 2012-01-03 20:18:00
# 58: 27 2012-01-03 04:39:00 2012-01-03 20:18:00
# 59: 28 2012-01-03 04:39:00 2012-01-03 20:18:00
# 60: 29 2012-01-03 04:39:00 2012-01-03 20:18:00
# 61: 30 2012-01-03 04:39:00 2012-01-03 20:18:00
# 62: 31 2012-01-03 04:39:00 2012-01-03 20:18:00
# 63: 32 2012-01-03 04:39:00 2012-01-03 20:18:00
# 64: 33 2012-01-03 04:39:00 2012-01-03 20:18:00
# 65: 34 2012-01-03 04:39:00 2012-01-03 20:18:00
# 66: 35 2012-01-03 04:39:00 2012-01-03 20:18:00
# 67: 36 2012-01-03 04:39:00 2012-01-03 20:18:00
# 68: 37 2012-01-03 04:39:00 2012-01-03 20:18:00
# 69: 38 2012-01-03 04:39:00 2012-01-03 20:18:00
# 70: 39 2012-01-03 04:39:00 2012-01-03 20:18:00
# 71: 40 2012-01-03 04:39:00 2012-01-03 20:18:00
# 72: 41 2012-01-03 04:39:00 2012-01-03 20:18:00
# 73: 42 2012-01-03 04:39:00 2012-01-03 20:18:00
# 74: 14 2012-01-03 07:47:00 2012-01-03 09:14:00
# 75: 15 2012-01-03 07:47:00 2012-01-03 09:14:00
# 76: 16 2012-01-03 07:47:00 2012-01-03 09:14:00
# 77: 10 2012-01-03 05:21:00 2012-01-03 08:01:00
# 78: 11 2012-01-03 05:21:00 2012-01-03 08:01:00
# 79: 12 2012-01-03 05:21:00 2012-01-03 08:01:00
# 80: 13 2012-01-03 05:21:00 2012-01-03 08:01:00
# index time time.1
此处,time
和time.1
分别为in_time
和out_time
。
以下是具有较大数据集的基准:
system.time({
setDT(entry_exit_times)
setDT(time_action)
time_action[entry_exit_times,
on = .(time >= time_in, time <= time_out), allow.cartesian = TRUE]
})
# user system elapsed
# 26.893 15.950 42.893
速度可以接受,但需要大量内存(最多可达60G)。
答案 1 :(得分:1)
我不知道这是否可行;这取决于“非常大”的含义。但这是一种没有循环的方法。找出哪些操作都高于time_in
且低于time_out
:
above_in <- outer(entry_exit_times$time_in,
action_times$time,
function(e, a) e <= a)
below_out <- outer(entry_exit_times$time_out,
action_times$time,
function(e, a) a <= e)
apply(above_in & below_out, 1, which)
根据您提供的数据
[[1]]
[1] 2 3 4 5 6 7 8 9 10 11 12 13 14 15
[[2]]
[1] 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
[[3]]
[1] 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
[[4]]
[1] 14 15 16
[[5]]
[1] 10 11 12 13
答案 2 :(得分:1)
我会使用lubridate::interval
来组合开始和结束时间,并使用同一个包中的%within%
函数进行搜索。与大多数 rowwise 操作一样,性能受到影响,在这种情况下我使用sapply
来构造逻辑向量以传递给filter
。 mt1022的data.table
解决方案将是一个优秀的表现者,但是,考虑到样本数据的规模,这种方法仍然是合理的。
library(dplyr)
library(lubridate)
entry_exit_times <- entry_exit_times %>%
mutate(
entry_interval = interval(time_in, time_out)
)
time_action %>%
as_tibble %>%
mutate(
keep = sapply(time, function(x) {
any(x %within% entry_exit_times$entry_interval)
})
) %>%
filter(keep) %>%
select(-keep)
# # A tibble: 25,019 x 2
# index time
# <dbl> <dttm>
# 1 2 2012-01-03 00:01:28
# 2 3 2012-01-03 00:04:15
# 3 4 2012-01-03 00:07:05
# 4 5 2012-01-03 00:08:18
# 5 6 2012-01-03 00:08:37
# 6 7 2012-01-03 00:10:56
# 7 8 2012-01-03 00:11:02
# 8 9 2012-01-03 00:12:28
# 9 10 2012-01-03 00:13:22
# 10 11 2012-01-03 00:15:54
# # ... with 25,009 more rows