我想在标识符类别中采用一组可能重叠的间隔,并创建完全重叠(即相同的开始/结束值)或完全不重叠的新间隔。这些新间隔应共同跨越原始间隔的范围,并且不包括原始间隔之外的任何范围。
这需要相对快速的操作,因为我正在处理大量数据。
以下是一些示例数据:
library(data.table)
set.seed(1113)
start1 <- c(1,7,9, 17, 18,1,3,20)
end1 <- c(10,12,15, 20, 23,3,5,25)
id1 <- c(1,1,1,1,1,2,2,2)
obs <- rnorm(length(id))
x <- data.table(start1,end1,id1,obs)
> x
start1 end1 id1 obs
1: 1 10 1 -0.79701638
2: 7 12 1 -0.09251333
3: 9 15 1 -0.08118742
4: 17 20 1 -2.33312797
5: 18 23 1 0.26581138
6: 1 3 2 -0.34314127
7: 3 5 2 -0.17196880
8: 20 25 2 0.11614842
输出应该是这样的:
id1 start1 end1 i.start1 i.end1 obs
1: 1 1 6 1 10 -0.79701638
2: 1 7 8 1 10 -0.79701638
3: 1 7 8 7 12 -0.09251333
4: 1 9 10 1 10 -0.79701638
5: 1 9 10 7 12 -0.09251333
6: 1 9 10 9 15 -0.08118742
7: 1 11 12 7 12 -0.09251333
8: 1 11 12 9 15 -0.08118742
9: 1 13 15 9 15 -0.08118742
10: 1 17 17 17 20 -2.33312797
11: 1 18 20 17 20 -2.33312797
12: 1 18 20 18 23 0.26581138
13: 1 21 23 18 23 0.26581138
14: 2 1 2 1 3 -0.34314127
15: 2 3 3 1 3 -0.34314127
16: 2 3 3 3 5 -0.17196880
17: 2 4 5 3 5 -0.17196880
18: 2 20 25 20 25 0.11614842
我找到了与我想要的算法相对应的算法: https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541
我尝试直接对其进行编程,但是速度很慢。
答案 0 :(得分:1)
这是另一种选择。
require
输出:
#borrowing idea from https://stackoverflow.com/a/28938694/1989480
#group overlapping intervals together
x[, g := c(0L, cumsum(shift(start, -1L) > cummax(end))[-.N]), by=.(id)]
#cut those intervals into non-overlapping ones
itvl <- x[, {
s <- sort(c(start - 1L, start, end, end + 1L))
as.data.table(matrix(s[s %between% c(min(start), max(end))], ncol=2L, byrow=TRUE))
}, by=.(id, g)]
#get OP's desired output using non-equi join
x[itvl, on=.(id, start<=V1, end>=V1),
.(id1=id, start1=V1, end1=V2, i.start1=x.start, i.end1=x.end, obs),
allow.cartesian=TRUE]
数据:
id1 start1 end1 i.start1 i.end1 obs
1: 1 1 6 1 10 -0.79701638
2: 1 7 8 1 10 -0.79701638
3: 1 7 8 7 12 -0.09251333
4: 1 9 10 1 10 -0.79701638
5: 1 9 10 7 12 -0.09251333
6: 1 9 10 9 15 -0.08118742
7: 1 11 12 7 12 -0.09251333
8: 1 11 12 9 15 -0.08118742
9: 1 13 15 9 15 -0.08118742
10: 1 17 17 17 20 -2.33312797
11: 1 18 20 17 20 -2.33312797
12: 1 18 20 18 23 0.26581138
13: 1 21 23 18 23 0.26581138
14: 2 1 2 1 3 -0.34314127
15: 2 3 3 1 3 -0.34314127
16: 2 3 3 3 5 -0.17196880
17: 2 4 5 3 5 -0.17196880
18: 2 20 25 20 25 0.11614842
答案 1 :(得分:0)
这是我的解决方案。
它基于此处(https://softwareengineering.stackexchange.com/questions/363091/split-overlapping-ranges-into-all-unique-ranges?newreg=93383e379afe4dd3a595480528ee1541)的算法,但是使用data.table,shift和向量化ifelse语句来提高效率。它也与算法不同,因为我的代码允许对id_column标识的多个数据集分别执行此操作。我的方法还忽略了对行的跟踪(即“属性”),因为当可以随时使用foverlaps
将间隔轻松地合并回原始数据时,不必定义行。 foverlaps还可以消除空白
请告诉我您是否发现效率低下
remove_overlaps <- function(x, start_column, end_column, id_column=NULL){
xd <- melt(x[,c(start_column,end_column,id_column),with=FALSE],id=id_column)
xd[variable==start_column,end:=FALSE]
xd[variable==end_column,end:=TRUE]
setorderv(xd,c(id_column, "value","end"))
xd[,end_next:=shift(end,type="lead"),by=id_column]
xd[,value_next:=shift(value,type="lead"),by=id_column]
#excluding end_next when missing should cause this to ignore the last row in each group
#because this element will be NA as defined by shift
temp <- xd[,.SD[!is.na(end_next),list(
start=ifelse(!end,value,value+1),
end=ifelse(!end_next,value_next-1,value_next)
)],by=id_column]
temp <- temp[end>=start]
setnames(temp , c("start","end"),c(start_column,end_column))
setkeyv(temp,c(id_column,start_column,end_column))
out <- foverlaps(x,temp)
setorderv(out, c(id_column,start_column,
paste0("i.",start_column),
paste0("i.",end_column)
))
out
}
remove_overlaps(x, start_column="start1",end_column="end1",id_column="id1")
此外,就价值而言,我不认为that page上链接的建议在如何排除差距方面是正确的。
此答案未考虑差距(差距不应出现在 输出),因此我对其进行了优化:*如果e = false,则将a添加到S。如果e = true,则采用 *定义n'= n如果e = false或n'= n + 1如果e = true *定义 如果f = false则m'= m-1或如果f = true则m'= m *如果n'<= m'并且(e而不是f)= false,输出(n',m',S),否则不输出。 – Augman 18-23的12:19
这是在R中实现的该代码算法的第二个版本:remove_overlaps没有显式使用silentman。它的建议是排除空白,而remove_overlaps1使用该建议。请注意,这两个函数都不会通过后续对foverlaps的调用来排除间隙,只有在间隙与x(原始数据)中的间隙部分匹配时,它们才会返回间隔。
library(data.table)
remove_overlaps1 <- function(x, start_column, end_column, id_column=NULL){
xd <- melt(x[,c(start_column,end_column,id_column),with=FALSE],id=id_column)
xd[variable==start_column,end:=FALSE]
xd[variable==end_column,end:=TRUE]
setorderv(xd,c(id_column, "value","end"))
xd[,end_next:=shift(end,type="lead"),by=id_column]
xd[,value_next:=shift(value,type="lead"),by=id_column]
###subset to rows where (e & !f) = FALSE, as per comment suggestion on linked answer
temp <- xd[,.SD[!is.na(end_next)&!(end & !end_next),list(
start=ifelse(!end,value,value+1),
end=ifelse(!end_next,value_next-1,value_next)
)],by=id_column]
temp <- temp[end>=start]
setnames(temp , c("start","end"),c(start_column,end_column))
setkeyv(temp,c(id_column,start_column,end_column))
out <- foverlaps(x,temp) #this should exclude gaps since foverlaps by default subsets to
setorderv(out, c(id_column,start_column,
paste0("i.",start_column),
paste0("i.",end_column)
))
out
}
示例数据:
library(data.table)
x <-
structure(
list(
native_id = c(
"1",
"1",
"1",
"1",
"1"
),
n_start_date = c(14761, 14775,
14789, 14803, 14817),
n_end_date = c(14776, 14790, 14804, 14818,
14832),
obs = c(
31.668140525481,
34.8623263656539,
35.0841466093899,
37.2281249364127,
36.3726151694052
)
),
row.names = c(NA,-5L),
class = "data.frame",
.Names = c("native_id",
"n_start_date", "n_end_date", "obs")
)
setDT(x)
> x
native_id n_start_date n_end_date obs
1: 1 14761 14776 31.66814
2: 1 14775 14790 34.86233
3: 1 14789 14804 35.08415
4: 1 14803 14818 37.22812
5: 1 14817 14832 36.37262
结果:
> remove_overlaps(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
native_id n_start_date n_end_date i.n_start_date i.n_end_date obs
1: 1 14761 14774 14761 14776 31.66814
2: 1 14775 14776 14761 14776 31.66814
3: 1 14775 14776 14775 14790 34.86233
4: 1 14777 14788 14775 14790 34.86233
5: 1 14789 14790 14775 14790 34.86233
6: 1 14789 14790 14789 14804 35.08415
7: 1 14791 14802 14789 14804 35.08415
8: 1 14803 14804 14789 14804 35.08415
9: 1 14803 14804 14803 14818 37.22812
10: 1 14805 14816 14803 14818 37.22812
11: 1 14817 14818 14803 14818 37.22812
12: 1 14817 14818 14817 14832 36.37262
13: 1 14819 14832 14817 14832 36.37262
看似不正确,排除了太多间隔:
> remove_overlaps1(x, start_column="n_start_date",end_column="n_end_date",id_column="native_id")
native_id n_start_date n_end_date i.n_start_date i.n_end_date obs
1: 1 14761 14774 14761 14776 31.66814
2: 1 14775 14776 14761 14776 31.66814
3: 1 14775 14776 14775 14790 34.86233
4: 1 14789 14790 14775 14790 34.86233
5: 1 14789 14790 14789 14804 35.08415
6: 1 14803 14804 14789 14804 35.08415
7: 1 14803 14804 14803 14818 37.22812
8: 1 14817 14818 14803 14818 37.22812
9: 1 14817 14818 14817 14832 36.37262
10: 1 14819 14832 14817 14832 36.37262
答案 2 :(得分:0)
我为此和一些相关功能编写了一个程序包intervalaverage
:
library(data.table)
set.seed(1113)
start1 <- c(1,7,9, 17, 18,1,3,20)
end1 <- c(10,12,15, 20, 23,3,5,25)
id1 <- c(1,1,1,1,1,2,2,2)
obs <- rnorm(length(id1))
x <- data.table(start1,end1,id1,obs)
library(intervalaverage)
x[, start1:=as.integer(start1)]
x[, end1:=as.integer(end1)]
isolateoverlaps(x,interval_vars = c("start1","end1"),group_vars = "id1")
id1 start end start1 end1 obs
1: 1 1 6 1 10 -0.79701638
2: 1 7 8 1 10 -0.79701638
3: 1 9 10 1 10 -0.79701638
4: 1 7 8 7 12 -0.09251333
5: 1 9 10 7 12 -0.09251333
6: 1 11 12 7 12 -0.09251333
7: 1 9 10 9 15 -0.08118742
8: 1 11 12 9 15 -0.08118742
9: 1 13 15 9 15 -0.08118742
10: 1 17 17 17 20 -2.33312797
11: 1 18 20 17 20 -2.33312797
12: 1 18 20 18 23 0.26581138
13: 1 21 23 18 23 0.26581138
14: 2 1 2 1 3 -0.34314127
15: 2 3 3 1 3 -0.34314127
16: 2 3 3 3 5 -0.17196880
17: 2 4 5 3 5 -0.17196880
18: 2 20 25 20 25 0.11614842
y <- data.table(start1=c(1L,5L,5L),end1=c(5L,5L,10L),id=c(1L,1L,1L))
isolateoverlaps(y,interval_vars = c("start1","end1"),group_vars = "id")
id start end start1 end1
1: 1 1 4 1 5
2: 1 5 5 1 5
3: 1 5 5 5 5
4: 1 5 5 5 10
5: 1 6 10 5 10