查找事件的时间戳是否在一个时间间隔内

时间:2019-10-16 14:06:00

标签: r optimization parallel-processing data.table

我有一个带有事件的data.frame及其到达时间戳(微秒精度)。在第二个数据框中,我具有状态,其中包含开始时间和结束时间(=有效期)。

下面,我为循环例程编写了代码,但确实很慢。 我认为,结合data.table,map / apply,parallelization(我有12个可用内核)可以大大减少处理时间。

能否请您帮助优化我的代码?

谢谢!

options(digits.secs = 6)

start <- strptime("2019-10-16 08:00:00.789543 CET", "%Y-%m-%d %H:%M:%OS")
start <- format(start, "%Y-%m-%d %H:%M:%OS")

end <- strptime("2019-10-16 08:10:00.471123 CET", "%Y-%m-%d %H:%M:%OS")
end <- format(end, "%Y-%m-%d %H:%M:%OS")


#### events
event_timestamps <- seq.POSIXt(as.POSIXct(start), 
                    as.POSIXct(end), units = "seconds", by = .1)
events <- sprintf("event%s",seq(1:length(event_timestamps)))


events_df <- data.frame(event_timestamps, events, stringsAsFactors=FALSE)

#### states
states <- sprintf("state%s",seq(1:4))

state_start <- c("2019-10-16 07:00:00.000000 CEST",
                 "2019-10-16 08:03:00.765233 CEST",
                 "2019-10-16 08:05:03.765432 CEST",
                 "2019-10-16 08:05:03.765434")


state_end <- c("2019-10-16 08:03:00.765232 CEST",
               "2019-10-16 08:05:03.765431 CEST",
               "2019-10-16 08:05:03.765433 CEST",
               "2019-10-16 08:12:03.471122 CEST")

states_df <- data.frame(states, state_start = as.POSIXct(state_start),
                        state_end = as.POSIXct(state_end), stringsAsFactors=FALSE)

#The state dataframe contains states with non-overlapping start and end timestamps.
#That means that one event can fall into exactly one state
# the goal is for every event to find the state it belongs to

#########################################################################

library(lubridate)

# empty data.frame
resulting_df <- data.frame(events = character(),
                           state = character(),
                           stringsAsFactors=FALSE) 



# loop eventy by event
for(event in 1:nrow(events_df)) {
  # go with the event to the states data.frame
  for (state in 1:nrow(states_df)) {
    # define state's interval
    interv <- lubridate::interval(states_df$state_start[state], states_df$state_end[state], tzone = 'CET') 
    # check for every event if its timestamp is within the state interval
    if (events_df$event_timestamps[event] %within% interv){

      # then write the temp data.frame
      temp <- data.frame(events = events_df$events[event],
                         state = states_df$states[state],
                         stringsAsFactors=FALSE)

      # collect events with states
      resulting_df <- dplyr::bind_rows(resulting_df, temp)
      rm(temp)
      # one event can only be in one state at a time
      # after we found the state for the event, break the inner state loop
      # and move to the next event
      break
      next
    }


  }

}

5 个答案:

答案 0 :(得分:1)

您可以在data.table中使用滚动联接。这里的想法是将每个data.table的键设置为事件时间或状态的开始时间。然后,联接会将每个事件与最近的开始状态时间进行匹配。并且由于您具有非重叠状态,因此可以实现您想要的。

## Your creation code above
#########################################################################

library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
library(tictoc)
tic()
# empty data.frame
resulting_df <- data.frame(events = character(),
                           state = character(),
                           stringsAsFactors=FALSE) 



# loop eventy by event
for(event in 1:nrow(events_df)) {
  # go with the event to the states data.frame
  for (state in 1:nrow(states_df)) {
    # define state's interval
    interv <- lubridate::interval(states_df$state_start[state], states_df$state_end[state], tzone = 'CET') 
    # check for every event if its timestamp is within the state interval
    if (events_df$event_timestamps[event] %within% interv){

      # then write the temp data.frame
      temp <- data.frame(events = events_df$events[event],
                         state = states_df$states[state],
                         stringsAsFactors=FALSE)

      # collect events with states
      resulting_df <- dplyr::bind_rows(resulting_df, temp)
      rm(temp)
      # one event can only be in one state at a time
      # after we found the state for the event, break the inner state loop
      # and move to the next event
      break
      next
    }


  }

}
toc()
#> 9.61 sec elapsed

library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:lubridate':
#> 
#>     hour, isoweek, mday, minute, month, quarter, second, wday,
#>     week, yday, year
events_dt <- data.table(events_df)
states_dt <- data.table(states_df)
setkey(states_dt, state_start)
setkey(events_dt, event_timestamps)
tic()
resulting_dt <- states_dt[events_dt, roll = T][,.(events, states)]
toc()
#> 0 sec elapsed
all(data.table(resulting_df) == resulting_dt)
#> [1] TRUE

reprex package(v0.3.0)于2019-10-16创建

答案 1 :(得分:1)

您可以按如下方式使用foverlaps包中的函数data.table(非常快!):

setDT(states_df, key = c("state_start", "state_end"))
cols <- c("state_start", "state_end")
setDT(events_df)[, (cols) := event_timestamps]

foverlaps(events_df, states_df)[, paste0("i.", cols) := NULL]

要了解功能foverlaps的工作原理,最好阅读其文档here

答案 2 :(得分:0)

您可以尝试sqldf软件包。不知道它在整个数据集中的效率如何,但这应该可以工作:

library(sqldf)
sqldf('SELECT events_df.events, states_df.states 
       FROM events_df INNER JOIN states_df 
       ON events_df.event_timestamps BETWEEN states_df.state_start AND states_df.state_end')

答案 3 :(得分:0)

您可以使用intervals软件包将某物混在一起。这是我的快速而混乱的尝试:

int_mat = intervals::Intervals(cbind(as.numeric(gsub("\\D","", state_start)), 
                                     as.numeric(gsub("\\D","", state_end))))
point_mat = intervals::Intervals(cbind(as.numeric(gsub("\\D","", event_timestamps)), 
                                       as.numeric(gsub("\\D","", event_timestamps))))
ls = intervals::interval_included(int_mat, point_mat)
# ls[[n]] are indices of points that belong to the n-th interval

请注意,Intervals()仅接受数字矩阵,因此首先将时间戳转换为整数。所有时间戳都必须采用完全相同的格式,并包含前导/后缀零(或者只是使用与我不同的方式将它们转换为整数)。

答案 4 :(得分:0)

使用data.table中的函数供参考的某些时间:

library(data.table) #data.table_1.12.4

s <- as.POSIXct(strptime("2019-10-01 00:00:00.000000 CET", "%Y-%m-%d %H:%M:%OS"))
e <- as.POSIXct(strptime("2019-10-10 23:59:59.999999 CET", "%Y-%m-%d %H:%M:%OS"))

#8,640,000 rows
events <- data.table(TIME=seq.POSIXt(s, e, units="seconds", by=.1))[, EVENT := .I]

#863,999 rows
h <- seq.POSIXt(s, e, units="hour", by=1)
states <- data.table(STATE=seq_len(length(h)-1L), START=h[-length(h)], END=h[-1L],
    key=c("START","END"))

events_foverlap <- copy(events)[, c("START", "END") := TIME]
states_foverlap <- copy(states)
setkey(events, TIME)

dt_foverlap <- function() {
    ans <- foverlaps(events_foverlap, states_foverlap, type="any", mult="first")
    ans[, .N]
}

dt_nonequi <- function() {
    ans <- states[events, on=.(START<=TIME, END>=TIME), mult="first"]
    ans[,.N]
}

dt_roll <- function() {
    ans <- states[events, roll=TRUE]   
    ans[,.N]
}

bench::mark(dt_foverlap(), dt_nonequi(), dt_roll())

时间:

# A tibble: 3 x 13
  expression         min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result    memory             time     gc              
  <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>    <list>             <list>   <list>          
1 dt_foverlap()    2.99s    2.99s     0.335    1.24GB    1.00      1     3      2.99s <int [1]> <df[,3] [122 x 3]> <bch:tm> <tibble [1 x 3]>
2 dt_nonequi()     3.78s    3.78s     0.265  372.55MB    0.265     1     1      3.78s <int [1]> <df[,3] [43 x 3]>  <bch:tm> <tibble [1 x 3]>
3 dt_roll()        1.09s    1.09s     0.918  329.69MB    0.918     1     1      1.09s <int [1]> <df[,3] [33 x 3]>  <bch:tm> <tibble [1 x 3]>