让我先列出一组互不重叠的基因组区间:
chr1 1 100
chr1 101 200
chr1 201 300
chr1 301 400
以及链接到不同样本的基因组位置列表:
chr1 50 sampleA
chr1 60 sampleB
chr1 110 sampleA
chr1 130 sampleB
chr1 160 sampleA
chr1 190 sampleC
chr1 350 sampleB
chr1 360 sampleB
我的目标是计算每个区间内唯一样本的数量。在我的真实数据集中,区间表约有400,000行,而基因组位置-样本表约有30,000行。
此计算嵌入在模拟中,因此应尽可能快。我已经尝试使用GenomicRanges作为:
# Load GenomicRanges. NOTE(review): prefer library() over require() for a
# mandatory dependency -- require() only warns and returns FALSE on failure.
require(GenomicRanges)
# One range per interval: chr + [start, end].
interval.gr <- GRanges(intervals$chr,IRanges(intervals$start,intervals$end))
# Positions become width-1 ranges (pos, pos) so they can overlap intervals.
positions.gr <- GRanges(positions$chr,IRanges(positions$pos,positions$pos))
# Hit table: queryHits() indexes intervals, subjectHits() indexes positions.
ov <- findOverlaps(interval.gr,positions.gr)
# Expand to one row per (interval, position) hit, attach the sample, then
# count distinct samples per interval.
# NOTE(review): intervals with zero overlapping positions vanish here --
# slice(queryHits(ov)) keeps only intervals present in the hit table, which
# is why interval 201-300 is missing from the result.
intervals %>%
slice(queryHits(ov)) %>%
mutate(sample=positions$sample[subjectHits(ov)]) %>%
group_by(chr,start,end) %>%
summarise(n_sample=length(unique(sample)))
结果
# A tibble: 3 x 4
# Groups: chr, start [3]
chr start end n_sample
<fct> <dbl> <dbl> <int>
1 chr1 1 100 2
2 chr1 101 200 3
3 chr1 301 400 1
但是,它仍然会丢失没有采样的间隔(201-300),而且速度也不是很快。使用我的数据集:
Unit: milliseconds
expr min lq mean median uq max neval
x 159.3901 161.621 190.1703 164.4879 168.3116 297.8395 10
我想知道是否有更好,更快的方法来进行这种分析?
谢谢
可复制的数据集:
# Four contiguous, non-overlapping 100 bp intervals on chr1.
intervals <- data.frame(
  chr   = rep("chr1", 4),
  start = c(1, 101, 201, 301),
  end   = c(100, 200, 300, 400)
)
# Eight positions on chr1, each tagged with the sample it came from.
positions <- data.frame(
  chr    = rep("chr1", 8),
  pos    = c(50, 60, 110, 130, 160, 190, 350, 360),
  sample = c("sampleA", "sampleB", "sampleA", "sampleB",
             "sampleA", "sampleC", "sampleB", "sampleB")
)
编辑
可复制的数据集,其大小与我的真实数据集相同
# Simulated data at the scale of the real dataset (~400k intervals, ~30k
# positions). A fixed seed makes this "reproducible dataset" actually
# reproducible across sessions.
set.seed(1)
intervals <- data.frame(
  chr = paste0("chr", round(runif(400000, min = 1, max = 22))),
  start = round(runif(n = 400000, min = 1, max = 100000000))
)
# Every interval is a fixed 100 bp window from its start.
intervals$end <- intervals$start + 100
positions <- data.frame(
  chr = paste0("chr", round(runif(30000, min = 1, max = 22))),
  pos = round(runif(n = 30000, min = 1, max = 100000000)),
  # replace = TRUE (was T): TRUE/FALSE are reserved words, T/F can be
  # reassigned and should not be relied on.
  sample = sample(paste0("sample", 1:400), size = 30000, replace = TRUE)
)
答案 0 :(得分:2)
以@Jon所说的为基础,data.table是解决此问题的好方法。使用foverlaps()函数可以大大提高速度。
library(data.table)

# Toy data: four non-overlapping intervals, eight sample-tagged positions.
intervals <- data.frame(chr = c("chr1", "chr1", "chr1", "chr1"),
                        start = c(1, 101, 201, 301),
                        end = c(100, 200, 300, 400))
positions <- data.frame(chr = rep("chr1", 8),
                        pos = c(50, 60, 110, 130, 160, 190, 350, 360),
                        sample = c("sampleA", "sampleB", "sampleA", "sampleB",
                                   "sampleA", "sampleC", "sampleB", "sampleB"))
setDT(positions)
setDT(intervals)

# foverlaps() needs a start/end pair on both sides, so duplicate pos into a
# zero-width range, then key on it.
positions[, pos_tmp := pos]
setkey(positions, chr, pos, pos_tmp)

# Left overlap join: every interval is kept; intervals with no overlapping
# position come back with sample = NA instead of being dropped.
overlap <- foverlaps(intervals, positions, type = "any",
                     by.x = c("chr", "start", "end"))

# Count DISTINCT samples per interval. The original used
#   overlap[!is.na(sample), .(n_sample = .N), by = .(chr, start, end)]
# which is wrong twice: .N counts overlap rows, so a sample hit twice in one
# interval is double-counted (sampleB at 350 and 360 gives 2 instead of 1),
# and filtering !is.na(sample) drops the zero-overlap interval 201-300
# instead of reporting n_sample = 0 -- the exact problem asked about.
overlap[, .(n_sample = uniqueN(sample, na.rm = TRUE)), by = .(chr, start, end)]
@Jon提到的实现方式在我的机器上需要约6秒,而上述实现只需要约180毫秒。
答案 1 :(得分:0)
对于较大的数据集,我会尝试在data.table中使用非等值连接(non-equi join)。
样本中的基准测试结果比tmfmnk的答案差,但是对于更大的数据集,它可能具有较小的内存占用空间,并且速度更快。
library(data.table)
library(microbenchmark)

# Toy data as data.tables.
intervals <-
  data.table(
    chr = c("chr1", "chr1", "chr1", "chr1"),
    start = c(1, 101, 201, 301),
    end = c(100, 200, 300, 400)
  )
positions <-
  data.table(
    chr = rep("chr1", 8),
    pos = c(50, 60, 110, 130, 160, 190, 350, 360),
    sample = c(
      "sampleA",
      "sampleB",
      "sampleA",
      "sampleB",
      "sampleA",
      "sampleC",
      "sampleB",
      "sampleB"
    )
  )

# Non-equi join: for each interval, pull the positions whose pos lies in
# [start, end]. With the default nomatch = NA, intervals without any hit
# still produce one row (sample = NA), so uniqueN(..., na.rm = TRUE) yields
# 0 for them instead of losing the interval.
# uniqueN() already returns a single count per group, so the original
# sum(uniqueN(...)) wrapper was a no-op and is removed.
positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
  chr = i.chr,
  start = i.start,
  end = i.end,
  sample = x.sample
)][, .(n_sample = uniqueN(sample, na.rm = TRUE)), by = c("chr", "start", "end")]

# Benchmark the same pipeline.
microbenchmark(positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
  chr = i.chr,
  start = i.start,
  end = i.end,
  sample = x.sample
)][, .(n_sample = uniqueN(sample, na.rm = TRUE)), by = c("chr", "start", "end")]
, times = 50)
让我知道它是否起作用。 我进行了一些更改,以创建一个更接近您的用例的示例。 还包括Bryan进行基准测试的案例。您的方法仍然是最快的。我对第四种方法(只有2个键)抱有希望,但它仍然较慢。
library(data.table)
library(microbenchmark)
library(GenomicRanges)

# Fixed seed so the simulated data (and the benchmarks below) reproduce.
set.seed(14)

# ~400k random intervals across chr1..chr22.
intervals <-
  data.table(chr = paste0("chr", round(runif(
    400000, min = 1, max = 22
  ))), start = round(runif(
    n = 400000, min = 1, max = 100000000
  )))
# Snap starts onto a fixed 100 bp grid (1, 101, 201, ...) so every interval
# is a 100 bp window starting at ..1.
intervals[, start := floor(start / 100) * 100 + 1L]
intervals$end <- intervals$start + 100L
# Snapping can create duplicate intervals; keep each once.
intervals <-
  unique(intervals[, .SD, .SDcols = c("chr", "start", "end")])

# ~30k random positions, each assigned one of 400 samples.
positions <-
  data.table(
    chr = paste0("chr", round(runif(
      30000, min = 1, max = 22
    ))),
    pos = round(runif(
      n = 30000, min = 1, max = 100000000
    )),
    # replace = TRUE (was T): TRUE is a reserved word, T can be shadowed.
    sample = sample(paste0("sample", 1:400), size = 30000, replace = TRUE)
  )

# Keys so foverlaps()/keyed joins below can use binary search.
setkeyv(intervals, c("chr", "start", "end"))
positions[, pos2 := pos]
setkeyv(positions, c("chr", "pos", "pos2"))
# Benchmark 1: the question's GenomicRanges + dplyr approach, plus an rbind
# step that restores intervals with no overlapping position (n_sample = 0).
# NOTE(review): slice()/mutate()/group_by()/%>% come from dplyr, which is
# never loaded explicitly in this reprex -- presumably attached earlier in
# the session; confirm before running standalone.
microbenchmark({
interval.gr <-
GRanges(intervals$chr, IRanges(intervals$start, intervals$end))
positions.gr <-
GRanges(positions$chr, IRanges(positions$pos, positions$pos))
# Hit pairs: queryHits -> intervals, subjectHits -> positions.
ov <- findOverlaps(interval.gr, positions.gr)
res1 <- intervals %>%
slice(queryHits(ov)) %>%
mutate(sample = positions$sample[subjectHits(ov)]) %>%
group_by(chr, start, end) %>%
summarise(n_sample = length(unique(sample))) %>% data.table(.)
# Append all intervals (n_sample = NA) and sum with na.rm so zero-overlap
# intervals appear in the result with a count of 0.
intervals_1 <-
rbind(res1, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { interval.gr <- GRanges(intervals$chr, IRanges(intervals$start, intervals$end)) positions.gr <- GRanges(positions$chr, IRanges(positions$pos, positions$pos)) ov <- findOverlaps(interval.gr, positions.gr) res1 <- intervals %>% slice(queryHits(ov)) %>% mutate(sample = positions$sample[subjectHits(ov)]) %>% group_by(chr, start, end) %>% summarise(n_sample = length(unique(sample))) %>% data.table(.) intervals_1 <- rbind(res1, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")] }
#> min lq mean median uq max neval
#> 148.0612 164.4884 225.8665 178.5495 219.6038 585.5995 50
# Benchmark 2: data.table non-equi join. nomatch = 0L drops intervals with
# no overlapping position; the rbind below restores them with n_sample = 0.
microbenchmark({
# This takes only the ones we need:
res2 <-
positions[intervals, on = .(chr == chr, pos <= end, pos >= start), nomatch = 0L, .(
chr = i.chr,
start = i.start,
end = i.end,
sample = x.sample
)][, .(n_sample = length(unique(sample, na.rm = TRUE))), by = c("chr", "start", "end")]
# NOTE(review): unique() has no na.rm argument -- it is silently absorbed
# by `...` and ignored. Harmless here only because nomatch = 0L guarantees
# sample is never NA; data.table::uniqueN(sample, na.rm = TRUE) would
# express the intent correctly.
intervals_2 <-
rbind(res2, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { res2 <- positions[intervals, on = .(chr == chr, pos <= end, pos >= start), nomatch = 0L, .(chr = i.chr, start = i.start, end = i.end, sample = x.sample)][, .(n_sample = length(unique(sample, na.rm = TRUE))), by = c("chr", "start", "end")] intervals_2 <- rbind(res2, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")] }
#> min lq mean median uq max neval
#> 294.4081 338.0309 447.0925 383.2813 548.3734 883.4389 50
# Benchmark 3: foverlaps (Bryan's answer), corrected here to count distinct
# samples via length(unique(...)) rather than raw row counts.
microbenchmark({
# Bryan's approach
# foverlaps keeps all intervals; rows with sample = NA (no overlap) are
# filtered out before counting and then restored by the rbind below.
overlap = foverlaps(intervals,
positions,
type = "any",
by.x = c("chr", "start", "end")) ## return overlap indices
res3 <-
overlap[!is.na(sample), .(n_sample = length(unique(sample))), by = .(chr, start, end)]
intervals_3 <-
rbind(res3, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { overlap = foverlaps(intervals, positions, type = "any", by.x = c("chr", "start", "end")) res3 <- overlap[!is.na(sample), .(n_sample = length(unique(sample))), by = .(chr, start, end)] intervals_3 <- rbind(res3, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")] }
#> min lq mean median uq max neval
#> 221.907 280.8795 429.961 398.0075 475.0276 1017.866 50
## Starting by 1
# Benchmark 4: exploit the fixed 100 bp grid -- snap positions onto the same
# grid as interval starts, then use a plain keyed equi-join on (chr, start)
# instead of any overlap machinery.
set.seed(14)
intervals <-
data.table(chr = paste0("chr", round(runif(
400000, min = 1, max = 22
))), start = round(runif(
n = 400000, min = 1, max = 100000000
)))
# modified to start always with 1
intervals[, start := floor(start / 100) * 100 + 1L]
positions[, start := floor(pos / 100) * 100 + 1L]
# Deduplicate (chr, bin, sample) so each sample is counted once per bin.
positions <-
unique(positions[, .SD, .SDcols = c("chr", "start", "sample")])
setkeyv(intervals, c("chr", "start"))
setkeyv(positions, c("chr", "start"))
microbenchmark({
# Keyed join keeps every interval; bins without positions get sample = NA,
# so sum(!is.na(sample)) reports 0 for them.
intervals_4 <-
positions[intervals][, .(n_sample = sum(!is.na(sample))), by = c("chr", "start")]#
# add end
intervals_4[, end := start + 100L]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { intervals_4 <- positions[intervals][, .(n_sample = sum(!is.na(sample))), by = c("chr", "start")] intervals_4[, `:=`(end, start + 100L)] }
#> min lq mean median uq max neval
#> 419.1001 443.9567 546.0186 478.5985 650.4682 1115.179 50
由reprex package(v0.3.0)于2019-09-25创建