Question

我想在标识符类别中采用一组可能重叠的间隔，并创建完全重叠（即相同的开始/结束值）或完全不重叠的新间隔。这些新间隔应共同跨越原始间隔的范围，并且不包括原始间隔之外的任何范围。

这需要相对快速的操作，因为我正在处理大量数据。

以下是一些示例数据：

library(data.table)
set.seed(1113)
start1 <- c(1,7,9, 17, 18,1,3,20)
end1 <- c(10,12,15, 20, 23,3,5,25)
id1 <- c(1,1,1,1,1,2,2,2)
obs <- rnorm(length(id))
x <- data.table(start1,end1,id1,obs)

    > x
   start1 end1 id1         obs
1:      1   10   1 -0.79701638
2:      7   12   1 -0.09251333
3:      9   15   1 -0.08118742
4:     17   20   1 -2.33312797
5:     18   23   1  0.26581138
6:      1    3   2 -0.34314127
7:      3    5   2 -0.17196880
8:     20   25   2  0.11614842

输出应该是这样的：

    id1 start1 end1 i.start1 i.end1         obs
 1:   1      1    6        1     10 -0.79701638
 2:   1      7    8        1     10 -0.79701638
 3:   1      7    8        7     12 -0.09251333
 4:   1      9   10        1     10 -0.79701638
 5:   1      9   10        7     12 -0.09251333
 6:   1      9   10        9     15 -0.08118742
 7:   1     11   12        7     12 -0.09251333
 8:   1     11   12        9     15 -0.08118742
 9:   1     13   15        9     15 -0.08118742
10:   1     17   17       17     20 -2.33312797
11:   1     18   20       17     20 -2.33312797
12:   1     18   20       18     23  0.26581138
13:   1     21   23       18     23  0.26581138
14:   2      1    2        1      3 -0.34314127
15:   2      3    3        1      3 -0.34314127
16:   2      3    3        3      5 -0.17196880
17:   2      4    5        3      5 -0.17196880
18:   2     20   25       20     25  0.11614842

我找到了与我想要的算法相对应的算法： https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541

我尝试直接对其进行编程，但是速度很慢。

Answer 1

这是另一种选择。

require

输出：

#borrowing idea from https://stackoverflow.com/a/28938694/1989480
#group overlapping intervals together
x[, g := c(0L, cumsum(shift(start, -1L) > cummax(end))[-.N]), by=.(id)]

#cut those intervals into non-overlapping ones
itvl <- x[, {
    s <- sort(c(start - 1L, start, end, end + 1L))
    as.data.table(matrix(s[s %between% c(min(start), max(end))], ncol=2L, byrow=TRUE))
    }, by=.(id, g)]

#get OP's desired output using non-equi join
x[itvl, on=.(id, start<=V1, end>=V1),
    .(id1=id, start1=V1, end1=V2, i.start1=x.start, i.end1=x.end, obs),
    allow.cartesian=TRUE]

数据：

    id1 start1 end1 i.start1 i.end1         obs
 1:   1      1    6        1     10 -0.79701638
 2:   1      7    8        1     10 -0.79701638
 3:   1      7    8        7     12 -0.09251333
 4:   1      9   10        1     10 -0.79701638
 5:   1      9   10        7     12 -0.09251333
 6:   1      9   10        9     15 -0.08118742
 7:   1     11   12        7     12 -0.09251333
 8:   1     11   12        9     15 -0.08118742
 9:   1     13   15        9     15 -0.08118742
10:   1     17   17       17     20 -2.33312797
11:   1     18   20       17     20 -2.33312797
12:   1     18   20       18     23  0.26581138
13:   1     21   23       18     23  0.26581138
14:   2      1    2        1      3 -0.34314127
15:   2      3    3        1      3 -0.34314127
16:   2      3    3        3      5 -0.17196880
17:   2      4    5        3      5 -0.17196880
18:   2     20   25       20     25  0.11614842

Answer 2

这是我的解决方案。它基于此处（https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541）的算法，但是使用data.table，shift和向量化ifelse语句来提高效率。它也与算法不同，因为我的代码允许对id_column标识的多个数据集分别执行此操作。我的方法还忽略了对行的跟踪（即“属性”），因为当可以随时使用foverlaps将间隔轻松地合并回原始数据时，不必定义行。 foverlaps还可以消除空白

请告诉我您是否发现效率低下

remove_overlaps <- function(x, start_column, end_column, id_column=NULL){

  xd <- melt(x[,c(start_column,end_column,id_column),with=FALSE],id=id_column)

  xd[variable==start_column,end:=FALSE]
  xd[variable==end_column,end:=TRUE]
  setorderv(xd,c(id_column, "value","end"))

  xd[,end_next:=shift(end,type="lead"),by=id_column]
  xd[,value_next:=shift(value,type="lead"),by=id_column]


  #excluding end_next when missing should cause this to ignore the last row in each group
  #because this element will be NA as defined by shift
  temp <- xd[,.SD[!is.na(end_next),list(
    start=ifelse(!end,value,value+1),
    end=ifelse(!end_next,value_next-1,value_next)
  )],by=id_column]

  temp <- temp[end>=start]

  setnames(temp , c("start","end"),c(start_column,end_column))

  setkeyv(temp,c(id_column,start_column,end_column))

  out <- foverlaps(x,temp)
  setorderv(out, c(id_column,start_column,
                   paste0("i.",start_column),
                   paste0("i.",end_column)
  ))

  out
}
remove_overlaps(x, start_column="start1",end_column="end1",id_column="id1")

此外，就价值而言，我不认为that page上链接的建议在如何排除差距方面是正确的。

此答案未考虑差距（差距不应出现在输出），因此我对其进行了优化：*如果e = false，则将a添加到S。如果e = true，则采用 *定义n'= n如果e = false或n'= n + 1如果e = true *定义如果f = false则m'= m-1或如果f = true则m'= m *如果n'<= m'并且（e而不是f）= false，输出（n'，m'，S），否则不输出。 – Augman 18-23的12:19

这是在R中实现的该代码算法的第二个版本：remove_overlaps没有显式使用silentman。它的建议是排除空白，而remove_overlaps1使用该建议。请注意，这两个函数都不会通过后续对foverlaps的调用来排除间隙，只有在间隙与x（原始数据）中的间隙部分匹配时，它们才会返回间隔。

library(data.table)



remove_overlaps1 <- function(x, start_column, end_column, id_column=NULL){

  xd <- melt(x[,c(start_column,end_column,id_column),with=FALSE],id=id_column)

  xd[variable==start_column,end:=FALSE]
  xd[variable==end_column,end:=TRUE]
  setorderv(xd,c(id_column, "value","end"))

  xd[,end_next:=shift(end,type="lead"),by=id_column]
  xd[,value_next:=shift(value,type="lead"),by=id_column]

###subset to rows where (e & !f) = FALSE, as per comment suggestion on linked answer
  temp <- xd[,.SD[!is.na(end_next)&!(end & !end_next),list(
    start=ifelse(!end,value,value+1),
    end=ifelse(!end_next,value_next-1,value_next)
  )],by=id_column]

  temp <- temp[end>=start]

  setnames(temp , c("start","end"),c(start_column,end_column))

  setkeyv(temp,c(id_column,start_column,end_column))


  out <- foverlaps(x,temp) #this should exclude gaps since foverlaps by default subsets to 
  setorderv(out, c(id_column,start_column,
                   paste0("i.",start_column),
                   paste0("i.",end_column)
  ))

  out
}

示例数据：

library(data.table)
x <-
  structure(
    list(
      native_id = c(
        "1",
        "1",
        "1",
        "1",
        "1"
      ),
      n_start_date = c(14761, 14775,
                       14789, 14803, 14817),
      n_end_date = c(14776, 14790, 14804, 14818,
                     14832),
      obs = c(
        31.668140525481,
        34.8623263656539,
        35.0841466093899,
        37.2281249364127,
        36.3726151694052
      )
    ),
    row.names = c(NA,-5L),
    class = "data.frame",
    .Names = c("native_id",
               "n_start_date", "n_end_date", "obs")
  )

setDT(x)

> x
   native_id n_start_date n_end_date      obs
1:         1        14761      14776 31.66814
2:         1        14775      14790 34.86233
3:         1        14789      14804 35.08415
4:         1        14803      14818 37.22812
5:         1        14817      14832 36.37262

结果：

> remove_overlaps(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
    native_id n_start_date n_end_date i.n_start_date i.n_end_date      obs
 1:         1        14761      14774          14761        14776 31.66814
 2:         1        14775      14776          14761        14776 31.66814
 3:         1        14775      14776          14775        14790 34.86233
 4:         1        14777      14788          14775        14790 34.86233
 5:         1        14789      14790          14775        14790 34.86233
 6:         1        14789      14790          14789        14804 35.08415
 7:         1        14791      14802          14789        14804 35.08415
 8:         1        14803      14804          14789        14804 35.08415
 9:         1        14803      14804          14803        14818 37.22812
10:         1        14805      14816          14803        14818 37.22812
11:         1        14817      14818          14803        14818 37.22812
12:         1        14817      14818          14817        14832 36.37262
13:         1        14819      14832          14817        14832 36.37262

看似不正确，排除了太多间隔：

>  remove_overlaps1(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
    native_id n_start_date n_end_date i.n_start_date i.n_end_date      obs
 1:         1        14761      14774          14761        14776 31.66814
 2:         1        14775      14776          14761        14776 31.66814
 3:         1        14775      14776          14775        14790 34.86233
 4:         1        14789      14790          14775        14790 34.86233
 5:         1        14789      14790          14789        14804 35.08415
 6:         1        14803      14804          14789        14804 35.08415
 7:         1        14803      14804          14803        14818 37.22812
 8:         1        14817      14818          14803        14818 37.22812
 9:         1        14817      14818          14817        14832 36.37262
10:         1        14819      14832          14817        14832 36.37262

Answer 3

我为此和一些相关功能编写了一个程序包intervalaverage：

library(data.table)
set.seed(1113)
start1 <- c(1,7,9, 17, 18,1,3,20)
end1 <- c(10,12,15, 20, 23,3,5,25)
id1 <- c(1,1,1,1,1,2,2,2)
obs <- rnorm(length(id1))
x <- data.table(start1,end1,id1,obs)

library(intervalaverage)

x[, start1:=as.integer(start1)]
x[, end1:=as.integer(end1)]
isolateoverlaps(x,interval_vars = c("start1","end1"),group_vars = "id1")

    id1 start end start1 end1         obs
 1:   1     1   6      1   10 -0.79701638
 2:   1     7   8      1   10 -0.79701638
 3:   1     9  10      1   10 -0.79701638
 4:   1     7   8      7   12 -0.09251333
 5:   1     9  10      7   12 -0.09251333
 6:   1    11  12      7   12 -0.09251333
 7:   1     9  10      9   15 -0.08118742
 8:   1    11  12      9   15 -0.08118742
 9:   1    13  15      9   15 -0.08118742
10:   1    17  17     17   20 -2.33312797
11:   1    18  20     17   20 -2.33312797
12:   1    18  20     18   23  0.26581138
13:   1    21  23     18   23  0.26581138
14:   2     1   2      1    3 -0.34314127
15:   2     3   3      1    3 -0.34314127
16:   2     3   3      3    5 -0.17196880
17:   2     4   5      3    5 -0.17196880
18:   2    20  25     20   25  0.11614842

y <- data.table(start1=c(1L,5L,5L),end1=c(5L,5L,10L),id=c(1L,1L,1L)) 
isolateoverlaps(y,interval_vars = c("start1","end1"),group_vars = "id")

   id start end start1 end1
1:  1     1   4      1    5
2:  1     5   5      1    5
3:  1     5   5      5    5
4:  1     5   5      5   10
5:  1     6  10      5   10

在标识符的值内，将重叠间隔分为非重叠间隔

3 个答案: