我有以下数据集,并希望在单独的列中获取重叠标签的数量(n.overlaps
),重叠标签的名称(overlap.labels
)以及重叠的持续时间(overlap.duration
)。
这是我的数据集:
label begin end
======================
lower 9.03 12.41
lower 28.773 29.975
lower 33.895 35.992
lower 46.814 48.854
lower 58.51 61.51
lower 62.971 63.491
upper 28.132 30.432
upper 46.716 50.82
upper 58.536 61.482
upper 29.975 33.895
upper 53.376 54.08
upper 10.358 11.958
upper 30.532 46.716
upper 51.633 58.536
head 9.918 14.818
head 29.823 30.623
head 58.802 61.404
head 61.404 63.562
表格,我想得到的就是这个:
lower.begin lower.end upper.begin upper.end head.begin head.end n.overlaps overlap.labels overlap.duration
9.03 12.41 10.358 11.958 9.918 14.418 3 lower|upper|head 1.6
28.773 29.975 28.132 30.432 29.823 30.623 3 lower|upper|head 0.152
33.895 35.992 30.532 46.716 - - 2 lower|upper 2.097
...
这是表格的直观表示:
structure(list(label = c("lower", "lower", "lower", "lower",
"lower", "lower", "upper", "upper", "upper", "upper", "upper",
"upper", "upper", "upper", "head", "head", "head", "head"
), begin = c(9.03, 28.773, 33.895, 46.814, 58.51, 62.971, 28.132,
46.716, 58.536, 29.975, 53.376, 10.358, 30.532, 51.633, 9.918,
29.823, 58.802, 61.404), end = c(12.41, 29.975, 35.992, 48.854,
61.51, 63.491, 30.432, 50.82, 61.482, 33.895, 54.08, 11.958,
46.716, 58.536, 14.818, 30.623, 61.404, 63.562)), .Names = c("label",
"begin", "end"), class = "data.frame", row.names = c(NA, -18L))
答案 0 :(得分:3)
这是一个评论,但它带有一张图片。
你想要的输出很不清楚。具体来说,您的示例数据看起来好像有三组重叠,分别为青色,浅绿色和李子色。
一旦我们同意这些是三个重叠区域,它甚至不清楚你想要什么。
library(data.table); setDT(x)
cols<-c(lower="black",upper="blue",middle="red")
ys<-c(lower=1.8,upper=2.2,middle=2)
par(mar=c(2.1,4.1,4.1,1.1))
x[,{plot(1,type="n",xlim=range(onset,offset),
ylim=c(1.7,2.3),yaxt="n",ylab="",xlab="",
main="Depiction of Intervals")
axis(side=2,at=ys[unique(label)],
labels=unique(label),las=1)}]
rect(x[order(onset)][1,onset],1.7,
x[order(offset)][3,offset],2.3,col="cyan")
rect(x[order(onset)][4,onset],1.7,
x[order(offset)][11,offset],2.3,col="lightgreen")
rect(x[order(onset)][12,onset],1.7,
x[order(offset)][18,offset],2.3,col="plum")
for (lbs in x[,unique(label)]){
x[label==lbs,
arrows(onset,ys[label],offset,ys[label],lwd=3,
code=3,angle=90,length=.07,col=cols[label])]
}
答案 1 :(得分:2)
这是一个开始。当我有更多时间时,我会添加最后三列。它看起来很复杂,但我使用lubridate
将持续时间转换为时间间隔。有一个名为new_interval
的函数可以创建它们,还有一个名为int_overlaps
的函数可以测试重叠。
<强>更新强>
代码已完成。检查它是否有帮助。
library(lubridate)
starts <- as.POSIXct(df$begin, origin=Sys.time())
ends <- as.POSIXct(df$end, origin=Sys.time())
spans <- new_interval(starts, ends)
s <- split(spans, df$label)
d <- split(df, df$label)
overlap <- function(x1, x2) {
out <- sapply(1:length(s[[x1]]), function(x) {
which(int_overlaps(s[[x1]][x], s[[x2]]))}
)
mat_lst <- lapply(out, function(x) {
matrix(c(d[[x2]]$begin[x],d[[x2]]$end[x]),ncol=2)}
)
mat_lst[lengths(mat_lst) == 0L] <- list(matrix(NA, ncol=2))
mat_lst
}
lh <- overlap("lower", "head")
lu <- overlap("lower", "upper")
matches <- suppressWarnings(lapply(1:nrow(d$lower), function(x) {
cbind(d$lower[x,2:3], lu[[x]], lh[[x]])}
))
new_df <- `names<-`(do.call(rbind, matches), c("lower.begin", "lower.end", "upper.begin", "upper.end", "head.begin", "head.end"))
rownames(new_df) <- NULL
#n.overlaps
count <- colSums(apply(new_df, 1, function(x) !is.na(x)))/2
new_df$n.overlaps <- ave(count, new_df$lower.begin, FUN=function(x) x+length(x)-1)
#overlap.labels
new_df$overlap.labels <- apply(new_df[1:6], 1, function(x)
paste(unique(gsub("\\..*", "", names(which(!is.na(x))))), collapse="|"))
#overlap.duration
first <- pmin(new_df$lower.end, new_df$upper.end)-new_df$upper.begin
second <- pmin(new_df$lower.end, new_df$head.end)-new_df$head.begin
overlap <- ifelse(is.na(first+second), ifelse(is.na(first), second, first), first+second)
new_df$overlap.duration <- ave(overlap, new_df$lower.begin, FUN=sum)
new_df
# lower.begin lower.end upper.begin upper.end head.begin head.end n.overlaps
# 1 9.030 12.410 10.358 11.958 9.918 14.818 3
# 2 28.773 29.975 28.132 30.432 29.823 30.623 4
# 3 28.773 29.975 29.975 33.895 29.823 30.623 4
# 4 33.895 35.992 29.975 33.895 NA NA 3
# 5 33.895 35.992 30.532 46.716 NA NA 3
# 6 46.814 48.854 46.716 50.820 NA NA 2
# 7 58.510 61.510 58.536 61.482 58.802 61.404 4
# 8 58.510 61.510 51.633 58.536 61.404 63.562 4
# 9 62.971 63.491 NA NA 61.404 63.562 2
# overlap.labels overlap.duration
# 1 lower|upper|head 4.092
# 2 lower|upper|head 2.147
# 3 lower|upper|head 2.147
# 4 lower|upper 9.380
# 5 lower|upper 9.380
# 6 lower|upper 2.138
# 7 lower|upper|head 12.557
# 8 lower|upper|head 12.557
# 9 lower|head 2.087
更新#2
我修改了matches
功能。它应该准备好更多样化。在旧脚本中替换它。
matches <- suppressWarnings(lapply(1:nrow(d$lower), function(x) {
max.len <- max(length(c(length(lu[[x]]), length(lh[[x]]))))
xu <- lu[[x]]
xh <- lh[[x]]
dim(xu) <- dim(xh) <- NULL
length(xu) <- length(xh) <- max.len
umat <- matrix(xu, byrow=T, ncol=2)
hmat <- matrix(xh, byrow=T, ncol=2)
cbind(d$lower[x,2:3], umat, hmat)}
))
数据强>
df <- structure(list(label = c("lower", "lower", "lower", "lower",
"lower", "lower", "upper", "upper", "upper", "upper", "upper",
"upper", "upper", "upper", "head", "head", "head", "head"
), begin = c(9.03, 28.773, 33.895, 46.814, 58.51, 62.971, 28.132,
46.716, 58.536, 29.975, 53.376, 10.358, 30.532, 51.633, 9.918,
29.823, 58.802, 61.404), end = c(12.41, 29.975, 35.992, 48.854,
61.51, 63.491, 30.432, 50.82, 61.482, 33.895, 54.08, 11.958,
46.716, 58.536, 14.818, 30.623, 61.404, 63.562)), .Names = c("label",
"begin", "end"), class = "data.frame", row.names = c(NA, -18L))
答案 2 :(得分:0)
尝试使用foverlaps
中的data.table
:
subset_dat <- function(x, .label) {
ans = x[label == .label]
setnames(ans, paste(.label, names(ans), sep="_"))
}
setkey(setDT(dat), begin, end))
olaps1 = foverlaps(subset_dat(dat, "head"), subset_dat(dat, "lower"), type="any")
olaps2 = foverlaps(subset_dat(dat, "upper"), subset_dat(dat, "lower"), type="any")
ans = merge(olaps1, olaps2, by=names(olaps1)[1:3], all=TRUE)
ans[, olap.labels := paste(lower_label, head_label, upper_label, sep="|")]
ans[, olap.labels := gsub("\\|NA|NA\\|", "", olap.labels)]
ans[, c("lower_label", "head_label", "upper_label") := NULL]
ans[, olap.count := sapply(gregexpr("\\|", olap.labels), function(x) sum(x != -1L)+1L)]
ans[, olap.interval := abs(pmax(lower_begin, head_begin, upper_begin, na.rm=TRUE) -
pmin(lower_end, head_end, upper_end, na.rm=TRUE))]
# lower_begin lower_end head_begin head_end upper_begin upper_end olap.labels olap.count olap.interval
# 1: NA NA NA NA 53.376 54.080 upper 1 0.704
# 2: 9.030 12.410 9.918 14.818 10.358 11.958 lower|head|upper 3 1.600
# 3: 28.773 29.975 29.823 30.623 28.132 30.432 lower|head|upper 3 0.152
# 4: 28.773 29.975 29.823 30.623 29.975 33.895 lower|head|upper 3 0.000
# 5: 33.895 35.992 NA NA 29.975 33.895 lower|upper 2 0.000
# 6: 33.895 35.992 NA NA 30.532 46.716 lower|upper 2 2.097
# 7: 46.814 48.854 NA NA 46.716 50.820 lower|upper 2 2.040
# 8: 58.510 61.510 58.802 61.404 51.633 58.536 lower|head|upper 3 0.266
# 9: 58.510 61.510 58.802 61.404 58.536 61.482 lower|head|upper 3 2.602
# 10: 58.510 61.510 61.404 63.562 51.633 58.536 lower|head|upper 3 2.868
# 11: 58.510 61.510 61.404 63.562 58.536 61.482 lower|head|upper 3 0.078
# 12: 62.971 63.491 61.404 63.562 NA NA lower|head 2 0.520