Question

在R中，我有一组时间间隔，其中一些时间间隔相互重叠，这些重叠可以形成重叠链（间隔A重叠B，B重叠C，但A不重叠C）。我想找到涵盖这组间隔的最小间隔集。

我有一个使用lubridate间隔的解决方案，但它使用较旧的范例，例如推送和弹出一叠间隔。该解决方案如下。我想知道我是否缺少一个更简单的功能解决方案或包应该为我做这个（我担心我的代码很脆弱，宁愿使用经过试验和测试的解决方案）。

suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(lubridate))

# In this table of intervals, rows 3,4, and 5 form a chain of intervals.  They should be rolled into 1 interval.
# And note rows 3 and 5 do not themselves overlap, but they are chained together by having overlap with row 4.
dat <- read.csv(text="
start,end
2017-09-01 00:00,2017-09-01 00:01
2017-09-01 00:02,2017-09-01 00:03
2017-09-01 00:04,2017-09-01 00:08
2017-09-01 00:07,2017-09-01 00:15
2017-09-01 00:09,2017-09-01 00:16
2017-09-01 00:20,2017-09-01 00:22") %>%
  transmute(
    gtStart = ymd_hm(start)
    , gtEnd = ymd_hm(end))


iv_clean <- list()
iv_process <- interval(dat$gtStart, dat$gtEnd)

while(length(iv_process) > 0) {
  e <- iv_process[1]
  iv_process <- iv_process[-1]

  ## If e is last item in iv_process, add it to iv_clean and stop processing
  if (!length(iv_process)) {
    if (!length(iv_clean)) {
      iv_clean <- e
    } else {
      iv_clean <- c(e, iv_clean)
    }
    break
  }

  ## For every remaining interval that overlaps e, union it with e
  ## And trip a flag that says that we found an overlapping interval
  e_nonoverlapping <- TRUE
  for (i in 1:length(iv_process)) {
    if (int_overlaps(e, iv_process[i])) {
      e_nonoverlapping <- FALSE
      iv_process[i] <- union(e, iv_process[i])
    }
  }

  ## If e did not overlap with any interval, then add it to iv_clean
  ## Otherwise, don't, and continue processing iv_process
  if (e_nonoverlapping) {
    if (!length(iv_clean)) {
      iv_clean <- e
    } else {
      iv_clean <- c(e, iv_clean)
    }
  }
}


## Print result
print(iv_clean)
#> [1] 2017-09-01 00:20:00 UTC--2017-09-01 00:22:00 UTC
#> [2] 2017-09-01 00:04:00 UTC--2017-09-01 00:16:00 UTC
#> [3] 2017-09-01 00:02:00 UTC--2017-09-01 00:03:00 UTC
#> [4] 2017-09-01 00:00:00 UTC--2017-09-01 00:01:00 UTC

Answer 1

我会以递归/准功能的方式做到这一点：

#finds the overlap end points
get_overlap<-function(start, end, dat){
  #which ones start before the base case ends?
  overlap<- which(dat$gtStart < end)

  if(length(overlap) == 1){
    return(list(start = start, end = end ))
  }

  else{

    #if we have more than 1 event in our overlap, find the new end point
    #drop the first row and recurse until we find the end of the interval.
    end<-max(dat[overlap,]$gtEnd)
    return(get_overlap(start, end, dat[-1,]))
  }
}

#walks through the df and find the intervals. assumes the df is sorted as your example.
recur<-function(dat, intervals){
  #base case
  if(nrow(dat) == 0){
    return(intervals)
  }

  start <-dat[1,]$gtStart
  end<- dat[1,]$gtEnd

  indices<-get_overlap(start, end, dat)

  end_row<-which(dat$gtEnd == indices$end)

  intervals[[length(intervals)+1]]<-list(
    start = dat[1,]$gtStart,
    end = indices$end,
    n_events = nrow(dat[1:end_row,]),
    dat = dat[1:end_row,])

  #remove the events from the last interval and recurse
  return(recur(dat[-(1:end_row),], intervals))
}

intervals<-recur(dat, list())

如果您有大量数据，那么在R中执行类似的操作会受到影响。有一个递归限制，我相信它默认为5000.如果代码中有什么不对，如果会很快达到。我认为pythons stack depth是1000，供参考。

您可以使用options(expressions = <some number>)来处理递归限制。但要小心，这些东西可以很快地通过记忆来咀嚼。

合并时间间隔集，间隔链到单个间隔

1 个答案: