我需要在单个数据集中找到重叠范围,但需要针对每个ID或因子水平找到它们。任何帮助将不胜感激!
library(dplyr)
# Example data: one range (Min, Max) per row, labelled with its Class.
# Passing the literal through `text =` lets read.table() open and close the
# underlying text connection itself; wrapping it in textConnection() by hand
# leaves an open connection behind.
df_foo <- read.table(
  text = "Class Min Max
A 500 630
A 100 200
B 100 200
A 210 310
A 200 210
B 210 310
A 510 530
B 200 210
A 705 800
B 500 630
B 510 530
B 705 800",
  header = TRUE
)
# ends_after_start[i, j] is TRUE when row i's Max is greater than row j's Min.
ends_after_start <- outer(df_foo$Max, df_foo$Min, ">")
# starts_before_end[i, j] is TRUE when row i's Min is less than row j's Max.
starts_before_end <- outer(df_foo$Min, df_foo$Max, "<")

# A row is flagged when it intersects more than one row of the table. Every row
# with Min < Max intersects itself, hence the "> 1". Class is not used here, so
# rows of different classes are compared with each other as well.
df_foo %>%
  mutate(Overlap = rowSums(ends_after_start & starts_before_end) > 1)
我得到的结果如下:
Class Min Max Overlap
1 A 500 630 TRUE
2 A 100 200 TRUE
3 B 100 200 TRUE
4 A 210 310 TRUE
5 A 200 210 TRUE
6 B 210 310 TRUE
7 A 510 530 TRUE
8 B 200 210 TRUE
9 A 705 800 TRUE
10 B 500 630 TRUE
11 B 510 530 TRUE
12 B 705 800 TRUE
但是我想这样查找A和B的每个层之间的重叠部分:
Class Min Max Overlap
1 A 500 630 TRUE
2 A 100 200 FALSE
3 B 100 200 FALSE
4 A 210 310 FALSE
5 A 200 210 FALSE
6 B 210 310 FALSE
7 A 510 530 TRUE
8 B 200 210 FALSE
9 A 705 800 FALSE
10 B 500 630 TRUE
11 B 510 530 TRUE
12 B 705 800 FALSE
答案 0 :(得分:1)
我用data.table写了一个解法,把它改写成dplyr应该很简单(straightforward)。思路是:在每个Class内,为每一行计算它之前所有行的Max的累积最大值:
library(data.table)

# Convert to a data.table by reference.
setDT(df_foo)
# The running-maximum comparison is only meaningful when the ranges of each
# class are visited in ascending order of Min, so sort first (by reference).
setorder(df_foo, Class, Min)
# For every row: the largest Max among the earlier rows of the same class.
# shift() yields NA for the first row of a class and also handles classes with
# a single row, where c(NA, x[1:(.N - 1)]) would produce a vector of length 2.
df_foo[, shiftedmaxmax := shift(cummax(Max)), by = Class]
Class Min Max shiftedmaxmax
1: A 100 200 NA
2: A 200 210 200
3: A 210 310 210
4: A 500 630 310
5: A 510 530 630
6: A 705 800 630
7: B 100 200 NA
8: B 200 210 200
9: B 210 310 210
10: B 500 630 310
11: B 510 530 630
12: B 705 800 630
然后将每一行的最小值(Min)与它之前各行的累积最大值(shiftedmaxmax)进行比较(如果Min低于该值,则当前行落在之前某一行的范围内,即存在重叠):
# TRUE when an earlier row reaches past this row's Min. The result is NA
# wherever shiftedmaxmax is NA.
df_foo[, superposed := shiftedmaxmax > Min]
Class Min Max shiftedmaxmax superposed
1: A 100 200 NA NA
2: A 200 210 200 FALSE
3: A 210 310 210 FALSE
4: A 500 630 310 FALSE
5: A 510 530 630 TRUE
6: A 705 800 630 FALSE
7: B 100 200 NA NA
8: B 200 210 200 FALSE
9: B 210 310 210 FALSE
10: B 500 630 310 FALSE
11: B 510 530 630 TRUE
12: B 705 800 630 FALSE
这样还漏掉了每组重叠范围中的第一行(即覆盖了后续行的那一行),您可以通过以下方式找到它:
# A row is the "source" of an overlap when its Max is the running maximum that
# a superposed row of the same class was compared against. which() keeps only
# the TRUE positions, so the NA entries of `superposed` are not used as indices.
df_foo[, superposedsource := Max %in% shiftedmaxmax[which(superposed)], by = Class]
# A row takes part in an overlap if it is superposed on an earlier row or is
# the source of one. Where `superposed` is NA and the row is not a source, the
# result stays NA.
df_foo[, superposedtot := superposed | superposedsource]
Class Min Max shiftedmaxmax superposed superposedsource superposedtot
1: A 100 200 NA NA FALSE NA
2: A 200 210 200 FALSE FALSE FALSE
3: A 210 310 210 FALSE FALSE FALSE
4: A 500 630 310 FALSE TRUE TRUE
5: A 510 530 630 TRUE FALSE TRUE
6: A 705 800 630 FALSE FALSE FALSE
7: B 100 200 NA NA FALSE NA
8: B 200 210 200 FALSE FALSE FALSE
9: B 210 310 210 FALSE FALSE FALSE
10: B 500 630 310 FALSE TRUE TRUE
11: B 510 530 630 TRUE FALSE TRUE
12: B 705 800 630 FALSE FALSE FALSE
答案 1 :(得分:0)
与dplyr
# Flag every range that overlaps another range of the same Class.
# Rows are sorted by Min inside each class first, so the input no longer has to
# arrive in ascending order. For sorted ranges, a row overlaps
#   * an earlier row when its Min is below the running maximum of the earlier
#     Max values (this also catches ranges nested in an older, wider range),
#   * a later row when the next row's Min is below its own Max.
# coalesce() turns the NA produced at the first/last row of a class into FALSE.
# The result is left grouped by Class.
df <- df_foo %>%
  arrange(Class, Min) %>%
  group_by(Class) %>%
  mutate(
    Overlap = coalesce(Min < lag(cummax(Max)), FALSE) |
      coalesce(lead(Min) < Max, FALSE)
  )
> df
# A tibble: 12 x 4
# Groups: Class [2]
Class Min Max Overlap
<fct> <dbl> <dbl> <lgl>
1 A 100 200 FALSE
2 A 200 210 FALSE
3 A 210 310 FALSE
4 A 500 630 TRUE
5 A 510 530 TRUE
6 A 705 800 FALSE
7 B 100 200 FALSE
8 B 200 210 FALSE
9 B 210 310 FALSE
10 B 500 630 TRUE
11 B 510 530 TRUE
12 B 705 800 FALSE
此代码假定您的值按升序排列,因为它仅检查上一行。
修改
不是最漂亮,但可以。
# as.character() works for both factor and character columns, whereas
# as.character.factor() fails when Class is already character.
df_foo$Class <- as.character(df_foo$Class)
df_foo <- as.data.frame(df_foo)

# For each row, compare it with every other row of the same Class and flag it
# when at least one of them strictly overlaps it. The row order is untouched.
# The result is a logical column; any() over an empty set is FALSE, so a class
# with a single row is handled without special cases.
df_foo$Overlap <- vapply(
  seq_len(nrow(df_foo)),
  function(i) {
    others <- df_foo$Class == df_foo$Class[i]
    others[i] <- FALSE # never compare a row with itself
    any(
      df_foo$Min[i] < df_foo$Max[others] &
        df_foo$Max[i] > df_foo$Min[others]
    )
  },
  logical(1)
)
> df_foo
Class Min Max Overlap
1 A 500 630 TRUE
2 A 100 200 FALSE
3 B 100 200 FALSE
4 A 210 310 FALSE
5 A 200 210 FALSE
6 B 210 310 FALSE
7 A 510 530 TRUE
8 B 200 210 FALSE
9 A 705 800 FALSE
10 B 500 630 TRUE
11 B 510 530 TRUE
12 B 705 800 FALSE
应该有一种只用dplyr的写法,但我没能想出来。这段代码的做法是:遍历df_foo的每一行,取出同一组(Class)中的所有其他行,并逐一检查是否存在重叠(即当前行的Min小于另一行的Max,且当前行的Max大于另一行的Min)。
答案 2 :(得分:0)
另一种data.table
方法。
示例数据/范围的顺序与该答案无关... foverlaps()
为您完成了所有艰苦的工作。
样本数据
library( data.table )
# Turn the example data into a data.table for the overlap join.
dt <- data.table::as.data.table(df_foo)
代码
# Tag every range with its row number so that a range is never matched against
# itself and the matches can be mapped back to `dt` unambiguously (pasting
# Class, Min and Max together without a separator can collide).
dt[, row_id := .I]
# Set the key for the data.table; foverlaps() uses the last two key columns as
# the interval.
setkey(dt, Min, Max)
# Overlap join of dt with itself. Keep only pairs of *different* rows of the
# same class whose ranges strictly overlap: ranges that merely touch at an
# endpoint are dropped, while ranges sharing a Min or a Max but overlapping
# elsewhere are kept.
result <- foverlaps(dt, dt)[
  Class == i.Class & row_id != i.row_id & Min < i.Max & i.Min < Max
]
# A range overlaps if its row number appears among the matched pairs; the
# helper column is removed again before printing.
dt[, Overlap := row_id %in% result$row_id][, row_id := NULL][]
# Class Min Max Overlap
# 1: A 100 200 FALSE
# 2: B 100 200 FALSE
# 3: A 200 210 FALSE
# 4: B 200 210 FALSE
# 5: A 210 310 FALSE
# 6: B 210 310 FALSE
# 7: A 500 630 TRUE
# 8: B 500 630 TRUE
# 9: A 510 530 TRUE
# 10: B 510 530 TRUE
# 11: A 705 800 FALSE
# 12: B 705 800 FALSE