我有一个带有事件的data.frame及其到达时间戳(微秒精度)。在第二个数据框中,我具有状态,其中包含开始时间和结束时间(=有效期)。
下面,我为循环例程编写了代码,但确实很慢。 我认为,结合data.table,map / apply,parallelization(我有12个可用内核)可以大大减少处理时间。
能否请您帮助优化我的代码?
谢谢!
options(digits.secs = 6)
start <- strptime("2019-10-16 08:00:00.789543 CET", "%Y-%m-%d %H:%M:%OS")
start <- format(start, "%Y-%m-%d %H:%M:%OS")
end <- strptime("2019-10-16 08:10:00.471123 CET", "%Y-%m-%d %H:%M:%OS")
end <- format(end, "%Y-%m-%d %H:%M:%OS")
#### events
event_timestamps <- seq.POSIXt(as.POSIXct(start),
as.POSIXct(end), units = "seconds", by = .1)
events <- sprintf("event%s",seq(1:length(event_timestamps)))
events_df <- data.frame(event_timestamps, events, stringsAsFactors=FALSE)
#### states
states <- sprintf("state%s",seq(1:4))
state_start <- c("2019-10-16 07:00:00.000000 CEST",
"2019-10-16 08:03:00.765233 CEST",
"2019-10-16 08:05:03.765432 CEST",
"2019-10-16 08:05:03.765434")
state_end <- c("2019-10-16 08:03:00.765232 CEST",
"2019-10-16 08:05:03.765431 CEST",
"2019-10-16 08:05:03.765433 CEST",
"2019-10-16 08:12:03.471122 CEST")
states_df <- data.frame(states, state_start = as.POSIXct(state_start),
state_end = as.POSIXct(state_end), stringsAsFactors=FALSE)
#The state dataframe contains states with non-overlapping start and end timestamps.
#That means that one event can fall into exactly one state
# the goal is for every event to find the state it belongs to
#########################################################################
library(lubridate)
# empty data.frame
resulting_df <- data.frame(events = character(),
state = character(),
stringsAsFactors=FALSE)
# loop eventy by event
for(event in 1:nrow(events_df)) {
# go with the event to the states data.frame
for (state in 1:nrow(states_df)) {
# define state's interval
interv <- lubridate::interval(states_df$state_start[state], states_df$state_end[state], tzone = 'CET')
# check for every event if its timestamp is within the state interval
if (events_df$event_timestamps[event] %within% interv){
# then write the temp data.frame
temp <- data.frame(events = events_df$events[event],
state = states_df$states[state],
stringsAsFactors=FALSE)
# collect events with states
resulting_df <- dplyr::bind_rows(resulting_df, temp)
rm(temp)
# one event can only be in one state at a time
# after we found the state for the event, break the inner state loop
# and move to the next event
break
next
}
}
}
答案 0 :(得分:1)
您可以在data.table
中使用滚动联接。这里的想法是将每个data.table
的键设置为事件时间或状态的开始时间。然后,联接会将每个事件与最近的开始状态时间进行匹配。并且由于您具有非重叠状态,因此可以实现您想要的。
## Your creation code above
#########################################################################
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
library(tictoc)
tic()
# empty data.frame
resulting_df <- data.frame(events = character(),
state = character(),
stringsAsFactors=FALSE)
# loop eventy by event
for(event in 1:nrow(events_df)) {
# go with the event to the states data.frame
for (state in 1:nrow(states_df)) {
# define state's interval
interv <- lubridate::interval(states_df$state_start[state], states_df$state_end[state], tzone = 'CET')
# check for every event if its timestamp is within the state interval
if (events_df$event_timestamps[event] %within% interv){
# then write the temp data.frame
temp <- data.frame(events = events_df$events[event],
state = states_df$states[state],
stringsAsFactors=FALSE)
# collect events with states
resulting_df <- dplyr::bind_rows(resulting_df, temp)
rm(temp)
# one event can only be in one state at a time
# after we found the state for the event, break the inner state loop
# and move to the next event
break
next
}
}
}
toc()
#> 9.61 sec elapsed
library(data.table)
#>
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:lubridate':
#>
#> hour, isoweek, mday, minute, month, quarter, second, wday,
#> week, yday, year
events_dt <- data.table(events_df)
states_dt <- data.table(states_df)
setkey(states_dt, state_start)
setkey(events_dt, event_timestamps)
tic()
resulting_dt <- states_dt[events_dt, roll = T][,.(events, states)]
toc()
#> 0 sec elapsed
all(data.table(resulting_df) == resulting_dt)
#> [1] TRUE
由reprex package(v0.3.0)于2019-10-16创建
答案 1 :(得分:1)
您可以按如下方式使用foverlaps
包中的函数data.table
(非常快!):
setDT(states_df, key = c("state_start", "state_end"))
cols <- c("state_start", "state_end")
setDT(events_df)[, (cols) := event_timestamps]
foverlaps(events_df, states_df)[, paste0("i.", cols) := NULL]
要了解功能foverlaps
的工作原理,最好阅读其文档here
答案 2 :(得分:0)
您可以尝试sqldf软件包。不知道它在整个数据集中的效率如何,但这应该可以工作:
library(sqldf)
sqldf('SELECT events_df.events, states_df.states
FROM events_df INNER JOIN states_df
ON events_df.event_timestamps BETWEEN states_df.state_start AND states_df.state_end')
答案 3 :(得分:0)
您可以使用intervals
软件包将某物混在一起。这是我的快速而混乱的尝试:
int_mat = intervals::Intervals(cbind(as.numeric(gsub("\\D","", state_start)),
as.numeric(gsub("\\D","", state_end))))
point_mat = intervals::Intervals(cbind(as.numeric(gsub("\\D","", event_timestamps)),
as.numeric(gsub("\\D","", event_timestamps))))
ls = intervals::interval_included(int_mat, point_mat)
# ls[[n]] are indices of points that belong to the n-th interval
请注意,Intervals()
仅接受数字矩阵,因此首先将时间戳转换为整数。所有时间戳都必须采用完全相同的格式,并包含前导/后缀零(或者只是使用与我不同的方式将它们转换为整数)。
答案 4 :(得分:0)
使用data.table
中的函数供参考的某些时间:
library(data.table) #data.table_1.12.4
s <- as.POSIXct(strptime("2019-10-01 00:00:00.000000 CET", "%Y-%m-%d %H:%M:%OS"))
e <- as.POSIXct(strptime("2019-10-10 23:59:59.999999 CET", "%Y-%m-%d %H:%M:%OS"))
#8,640,000 rows
events <- data.table(TIME=seq.POSIXt(s, e, units="seconds", by=.1))[, EVENT := .I]
#863,999 rows
h <- seq.POSIXt(s, e, units="hour", by=1)
states <- data.table(STATE=seq_len(length(h)-1L), START=h[-length(h)], END=h[-1L],
key=c("START","END"))
events_foverlap <- copy(events)[, c("START", "END") := TIME]
states_foverlap <- copy(states)
setkey(events, TIME)
dt_foverlap <- function() {
ans <- foverlaps(events_foverlap, states_foverlap, type="any", mult="first")
ans[, .N]
}
dt_nonequi <- function() {
ans <- states[events, on=.(START<=TIME, END>=TIME), mult="first"]
ans[,.N]
}
dt_roll <- function() {
ans <- states[events, roll=TRUE]
ans[,.N]
}
bench::mark(dt_foverlap(), dt_nonequi(), dt_roll())
时间:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 dt_foverlap() 2.99s 2.99s 0.335 1.24GB 1.00 1 3 2.99s <int [1]> <df[,3] [122 x 3]> <bch:tm> <tibble [1 x 3]>
2 dt_nonequi() 3.78s 3.78s 0.265 372.55MB 0.265 1 1 3.78s <int [1]> <df[,3] [43 x 3]> <bch:tm> <tibble [1 x 3]>
3 dt_roll() 1.09s 1.09s 0.918 329.69MB 0.918 1 1 1.09s <int [1]> <df[,3] [33 x 3]> <bch:tm> <tibble [1 x 3]>