让我先列出一组互不重叠的基因组区间:
chr1 1 100
chr1 101 200
chr1 201 300
chr1 301 400
以及链接到不同样本的基因组位置列表:
chr1 50 sampleA
chr1 60 sampleB
chr1 110 sampleA
chr1 130 sampleB
chr1 160 sampleA
chr1 190 sampleC
chr1 350 sampleB
chr1 360 sampleB
我的目标是计算每个区间内唯一样本的数量。在我的真实数据集中,区间表约有400,000行,而基因组位置-样本表约有30,000行。
此计算嵌入在模拟中,因此应尽可能快。我已经尝试使用GenomicRanges作为:
# Load GenomicRanges. NOTE(review): prefer library() over require() for a
# mandatory dependency -- require() only warns and returns FALSE on failure.
require(GenomicRanges)
# One range per interval: chr + [start, end].
interval.gr <- GRanges(intervals$chr,IRanges(intervals$start,intervals$end))
# Positions become width-1 ranges (pos, pos) so they can overlap intervals.
positions.gr <- GRanges(positions$chr,IRanges(positions$pos,positions$pos))
# Hit table: queryHits() indexes intervals, subjectHits() indexes positions.
ov <- findOverlaps(interval.gr,positions.gr)
# Expand to one row per (interval, position) hit, attach the sample, then
# count distinct samples per interval.
# NOTE(review): intervals with zero overlapping positions vanish here --
# slice(queryHits(ov)) keeps only intervals present in the hit table, which
# is why interval 201-300 is missing from the result.
intervals %>%
slice(queryHits(ov)) %>%
mutate(sample=positions$sample[subjectHits(ov)]) %>%
group_by(chr,start,end) %>%
summarise(n_sample=length(unique(sample)))
结果
# A tibble: 3 x 4
# Groups: chr, start [3]
chr start end n_sample
<fct> <dbl> <dbl> <int>
1 chr1 1 100 2
2 chr1 101 200 3
3 chr1 301 400 1
但是,它仍然会丢失没有采样的间隔(201-300),而且速度也不是很快。使用我的数据集:
Unit: milliseconds
expr min lq mean median uq max neval
x 159.3901 161.621 190.1703 164.4879 168.3116 297.8395 10
我想知道是否有更好,更快的方法来进行这种分析?
谢谢
可复制的数据集:
# Four contiguous, non-overlapping 100 bp intervals on chr1.
intervals <- data.frame(
  chr   = rep("chr1", 4),
  start = c(1, 101, 201, 301),
  end   = c(100, 200, 300, 400)
)
# Eight positions on chr1, each tagged with the sample it came from.
positions <- data.frame(
  chr    = rep("chr1", 8),
  pos    = c(50, 60, 110, 130, 160, 190, 350, 360),
  sample = c("sampleA", "sampleB", "sampleA", "sampleB",
             "sampleA", "sampleC", "sampleB", "sampleB")
)
编辑
可复制的数据集,其大小与我的真实数据集相同
# Simulated data at the scale of the real dataset (~400k intervals, ~30k
# positions). A fixed seed makes this "reproducible dataset" actually
# reproducible across sessions.
set.seed(1)
intervals <- data.frame(
  chr = paste0("chr", round(runif(400000, min = 1, max = 22))),
  start = round(runif(n = 400000, min = 1, max = 100000000))
)
# Every interval is a fixed 100 bp window from its start.
intervals$end <- intervals$start + 100
positions <- data.frame(
  chr = paste0("chr", round(runif(30000, min = 1, max = 22))),
  pos = round(runif(n = 30000, min = 1, max = 100000000)),
  # replace = TRUE (was T): TRUE/FALSE are reserved words, T/F can be
  # reassigned and should not be relied on.
  sample = sample(paste0("sample", 1:400), size = 30000, replace = TRUE)
)
答案 0 :(得分:2)
以@Jon所说的为基础,data.table是解决此问题的好方法。使用foverlaps()函数可以大大提高速度。
library(data.table)

# Toy data: four non-overlapping intervals, eight sample-tagged positions.
intervals <- data.frame(chr = c("chr1", "chr1", "chr1", "chr1"),
                        start = c(1, 101, 201, 301),
                        end = c(100, 200, 300, 400))
positions <- data.frame(chr = rep("chr1", 8),
                        pos = c(50, 60, 110, 130, 160, 190, 350, 360),
                        sample = c("sampleA", "sampleB", "sampleA", "sampleB",
                                   "sampleA", "sampleC", "sampleB", "sampleB"))
setDT(positions)
setDT(intervals)

# foverlaps() needs a start/end pair on both sides, so duplicate pos into a
# zero-width range, then key on it.
positions[, pos_tmp := pos]
setkey(positions, chr, pos, pos_tmp)

# Left overlap join: every interval is kept; intervals with no overlapping
# position come back with sample = NA instead of being dropped.
overlap <- foverlaps(intervals, positions, type = "any",
                     by.x = c("chr", "start", "end"))

# Count DISTINCT samples per interval. The original used
#   overlap[!is.na(sample), .(n_sample = .N), by = .(chr, start, end)]
# which is wrong twice: .N counts overlap rows, so a sample hit twice in one
# interval is double-counted (sampleB at 350 and 360 gives 2 instead of 1),
# and filtering !is.na(sample) drops the zero-overlap interval 201-300
# instead of reporting n_sample = 0 -- the exact problem asked about.
overlap[, .(n_sample = uniqueN(sample, na.rm = TRUE)), by = .(chr, start, end)]
@Jon提到的实现方式在我的机器上需要约6秒,而上述实现只需要约180毫秒。
答案 1 :(得分:0)
对于较大的数据集,我会尝试在data.table中使用非等值连接(non-equi join)。
样本中的基准测试结果比tmfmnk的答案差,但是对于更大的数据集,它可能具有较小的内存占用空间,并且速度更快。
library(data.table)
library(microbenchmark)

# Toy data as data.tables.
intervals <-
  data.table(
    chr = c("chr1", "chr1", "chr1", "chr1"),
    start = c(1, 101, 201, 301),
    end = c(100, 200, 300, 400)
  )
positions <-
  data.table(
    chr = rep("chr1", 8),
    pos = c(50, 60, 110, 130, 160, 190, 350, 360),
    sample = c(
      "sampleA",
      "sampleB",
      "sampleA",
      "sampleB",
      "sampleA",
      "sampleC",
      "sampleB",
      "sampleB"
    )
  )

# Non-equi join: for each interval, pull the positions whose pos lies in
# [start, end]. With the default nomatch = NA, intervals without any hit
# still produce one row (sample = NA), so uniqueN(..., na.rm = TRUE) yields
# 0 for them instead of losing the interval.
# uniqueN() already returns a single count per group, so the original
# sum(uniqueN(...)) wrapper was a no-op and is removed.
positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
  chr = i.chr,
  start = i.start,
  end = i.end,
  sample = x.sample
)][, .(n_sample = uniqueN(sample, na.rm = TRUE)), by = c("chr", "start", "end")]

# Benchmark the same pipeline.
microbenchmark(positions[intervals, on = .(chr == chr, pos <= end, pos >= start), .(
  chr = i.chr,
  start = i.start,
  end = i.end,
  sample = x.sample
)][, .(n_sample = uniqueN(sample, na.rm = TRUE)), by = c("chr", "start", "end")]
, times = 50)
让我知道它是否起作用。 我进行了一些更改,以创建一个更接近您的用例的示例。 还包括Bryan进行基准测试的案例。您的方法仍然是最快的。我对第四种方法(只有2个键)抱有希望,但它仍然较慢。
library(data.table)
library(microbenchmark)
library(GenomicRanges)

# Fixed seed so the simulated data (and the benchmarks below) reproduce.
set.seed(14)

# ~400k random intervals across chr1..chr22.
intervals <-
  data.table(chr = paste0("chr", round(runif(
    400000, min = 1, max = 22
  ))), start = round(runif(
    n = 400000, min = 1, max = 100000000
  )))
# Snap starts onto a fixed 100 bp grid (1, 101, 201, ...) so every interval
# is a 100 bp window starting at ..1.
intervals[, start := floor(start / 100) * 100 + 1L]
intervals$end <- intervals$start + 100L
# Snapping can create duplicate intervals; keep each once.
intervals <-
  unique(intervals[, .SD, .SDcols = c("chr", "start", "end")])

# ~30k random positions, each assigned one of 400 samples.
positions <-
  data.table(
    chr = paste0("chr", round(runif(
      30000, min = 1, max = 22
    ))),
    pos = round(runif(
      n = 30000, min = 1, max = 100000000
    )),
    # replace = TRUE (was T): TRUE is a reserved word, T can be shadowed.
    sample = sample(paste0("sample", 1:400), size = 30000, replace = TRUE)
  )

# Keys so foverlaps()/keyed joins below can use binary search.
setkeyv(intervals, c("chr", "start", "end"))
positions[, pos2 := pos]
setkeyv(positions, c("chr", "pos", "pos2"))
# Benchmark 1: the question's GenomicRanges + dplyr approach, plus an rbind
# step that restores intervals with no overlapping position (n_sample = 0).
# NOTE(review): slice()/mutate()/group_by()/%>% come from dplyr, which is
# never loaded explicitly in this reprex -- presumably attached earlier in
# the session; confirm before running standalone.
microbenchmark({
interval.gr <-
GRanges(intervals$chr, IRanges(intervals$start, intervals$end))
positions.gr <-
GRanges(positions$chr, IRanges(positions$pos, positions$pos))
# Hit pairs: queryHits -> intervals, subjectHits -> positions.
ov <- findOverlaps(interval.gr, positions.gr)
res1 <- intervals %>%
slice(queryHits(ov)) %>%
mutate(sample = positions$sample[subjectHits(ov)]) %>%
group_by(chr, start, end) %>%
summarise(n_sample = length(unique(sample))) %>% data.table(.)
# Append all intervals (n_sample = NA) and sum with na.rm so zero-overlap
# intervals appear in the result with a count of 0.
intervals_1 <-
rbind(res1, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { interval.gr <- GRanges(intervals$chr, IRanges(intervals$start, intervals$end)) positions.gr <- GRanges(positions$chr, IRanges(positions$pos, positions$pos)) ov <- findOverlaps(interval.gr, positions.gr) res1 <- intervals %>% slice(queryHits(ov)) %>% mutate(sample = positions$sample[subjectHits(ov)]) %>% group_by(chr, start, end) %>% summarise(n_sample = length(unique(sample))) %>% data.table(.) intervals_1 <- rbind(res1, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")] }
#> min lq mean median uq max neval
#> 148.0612 164.4884 225.8665 178.5495 219.6038 585.5995 50
# Benchmark 2: data.table non-equi join. nomatch = 0L drops intervals with
# no overlapping position; the rbind below restores them with n_sample = 0.
microbenchmark({
# This takes only the ones we need:
res2 <-
positions[intervals, on = .(chr == chr, pos <= end, pos >= start), nomatch = 0L, .(
chr = i.chr,
start = i.start,
end = i.end,
sample = x.sample
)][, .(n_sample = length(unique(sample, na.rm = TRUE))), by = c("chr", "start", "end")]
# NOTE(review): unique() has no na.rm argument -- it is silently absorbed
# by `...` and ignored. Harmless here only because nomatch = 0L guarantees
# sample is never NA; data.table::uniqueN(sample, na.rm = TRUE) would
# express the intent correctly.
intervals_2 <-
rbind(res2, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { res2 <- positions[intervals, on = .(chr == chr, pos <= end, pos >= start), nomatch = 0L, .(chr = i.chr, start = i.start, end = i.end, sample = x.sample)][, .(n_sample = length(unique(sample, na.rm = TRUE))), by = c("chr", "start", "end")] intervals_2 <- rbind(res2, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")] }
#> min lq mean median uq max neval
#> 294.4081 338.0309 447.0925 383.2813 548.3734 883.4389 50
# Benchmark 3: foverlaps (Bryan's answer), corrected here to count distinct
# samples via length(unique(...)) rather than raw row counts.
microbenchmark({
# Bryan's approach
# foverlaps keeps all intervals; rows with sample = NA (no overlap) are
# filtered out before counting and then restored by the rbind below.
overlap = foverlaps(intervals,
positions,
type = "any",
by.x = c("chr", "start", "end")) ## return overlap indices
res3 <-
overlap[!is.na(sample), .(n_sample = length(unique(sample))), by = .(chr, start, end)]
intervals_3 <-
rbind(res3, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { overlap = foverlaps(intervals, positions, type = "any", by.x = c("chr", "start", "end")) res3 <- overlap[!is.na(sample), .(n_sample = length(unique(sample))), by = .(chr, start, end)] intervals_3 <- rbind(res3, intervals, fill = TRUE)[, sum(n_sample, na.rm = TRUE), by = c("chr", "start", "end")] }
#> min lq mean median uq max neval
#> 221.907 280.8795 429.961 398.0075 475.0276 1017.866 50
## Starting by 1
# Benchmark 4: exploit the fixed 100 bp grid -- snap positions onto the same
# grid as interval starts, then use a plain keyed equi-join on (chr, start)
# instead of any overlap machinery.
set.seed(14)
intervals <-
data.table(chr = paste0("chr", round(runif(
400000, min = 1, max = 22
))), start = round(runif(
n = 400000, min = 1, max = 100000000
)))
# modified to start always with 1
intervals[, start := floor(start / 100) * 100 + 1L]
positions[, start := floor(pos / 100) * 100 + 1L]
# Deduplicate (chr, bin, sample) so each sample is counted once per bin.
positions <-
unique(positions[, .SD, .SDcols = c("chr", "start", "sample")])
setkeyv(intervals, c("chr", "start"))
setkeyv(positions, c("chr", "start"))
microbenchmark({
# Keyed join keeps every interval; bins without positions get sample = NA,
# so sum(!is.na(sample)) reports 0 for them.
intervals_4 <-
positions[intervals][, .(n_sample = sum(!is.na(sample))), by = c("chr", "start")]#
# add end
intervals_4[, end := start + 100L]
}, times = 50)
#> Unit: milliseconds
#> expr
#> { intervals_4 <- positions[intervals][, .(n_sample = sum(!is.na(sample))), by = c("chr", "start")] intervals_4[, `:=`(end, start + 100L)] }
#> min lq mean median uq max neval
#> 419.1001 443.9567 546.0186 478.5985 650.4682 1115.179 50
由reprex package(v0.3.0)于2019-09-25创建