R中的data.frames与sapply之间的高效坐标匹配

时间:2014-01-31 03:57:14

标签: r sapply

我正在尝试获取一个向量,告诉我data.frame(transcriptcoords)中的哪些行

              chr  start  end
NONHSAT000001 chr1 11868 14409
NONHSAT000002 chr1 11871 14412
NONHSAT000003 chr1 11873 14409
NONHSAT000004 chr1 12009 13670
NONHSAT000005 chr1 14777 16668
NONHSAT000006 chr1 15602 29370

在另一个data.frame(genecoords)中松散地包含起始/结束坐标(具有+/- 10容差)

              chr  start  end
NONHSAG000001 chr1 11869 14412
NONHSAG000002 chr1 14778 29370
NONHSAG000003 chr1 29554 31109
NONHSAG000004 chr1 34554 36081
NONHSAG000005 chr1 36273 50281
NONHSAG000006 chr1 62948 63887

为此,我在第一个data.frame的行indece上进行sapply循环,使坐标与第二个data.frame中的任何行匹配。我有一个解决方案(如下所述),但它似乎相当慢(大约六秒钟,一行2000行):

   user  system elapsed 
   6.02    0.00    6.04

我试图了解可以优化哪些部分的sapply。是if / else块吗?或比较线(==,< =,> =)?或者更简单地说,它是一种本质上很慢的算法吗?

谢谢!我生成的代码如下:

load(url("http://www.giorgilab.org/stuff/data.rda"))

# Pre-vectorize the data frames
g0<-rownames(genecoords)
g1<-genecoords[,1]
g2<-as.integer(genecoords[,2])
g3<-as.integer(genecoords[,3])

t0<-rownames(transcriptcoords)
t1<-transcriptcoords[,1]
t2<-as.integer(transcriptcoords[,2])
t3<-as.integer(transcriptcoords[,3])

system.time(gs<-sapply(1:2000,function(i){
            t<-t0[i]
            chr<-t1[i]
            start<-t2[i]
            end<-t3[i]

            # Find a match (loose boundaries +/- 10)
            right1<-which(g1==chr)
            right2<-which(g2<=start+10)
            right3<-which(g3>=end-10)
            right<-intersect(right3,intersect(right1,right2))
            right<-g0[right]

            if(length(right)==1){
                g<-right
            } else if(length(right)>1){
                # Get the smallest match
                heregenecoords<-genecoords[right,]
                size<-apply(heregenecoords,1,function(x){abs(as.numeric(x[3])-as.numeric(x[2]))})
                g<-names(which.min(size))
            } else {
                g<-t
            }
            return(g)           
        }
))

3 个答案:

答案 0 :(得分:2)

使用您的数据

tx0 <- read.table(textConnection("chr  start  end
NONHSAT000001 chr1 11868 14409
NONHSAT000002 chr1 11871 14412
NONHSAT000003 chr1 11873 14409
NONHSAT000004 chr1 12009 13670
NONHSAT000005 chr1 14777 16668
NONHSAT000006 chr1 15602 29370"))

gene0 <- read.table(textConnection("chr  start  end
NONHSAG000001 chr1 11869 14412
NONHSAG000002 chr1 14778 29370
NONHSAG000003 chr1 29554 31109
NONHSAG000004 chr1 34554 36081
NONHSAG000005 chr1 36273 50281
NONHSAG000006 chr1 62948 63887"))

GenomicRanges中的Bioconductor包可轻松高效地完成此操作(数百万次重叠)。

library(GenomicRanges)
tx <- with(tx0, GRanges(chr, IRanges(start, end)))
gene <- with(gene0, GRanges(chr, IRanges(start, end)))

## increase width by 10 on both sides of the center of the gene range
gene <- resize(gene, width=width(gene) + 20, fix="center")
## find overlaps of 'query' tx and 'subject' gene, where query is within subject
olaps <- findOverlaps(tx, gene, type="within")

显示,例如,'query'(tx)1,2,3和4在'subject'(基因)1中。

> findOverlaps(tx, gene, type="within")
Hits of length 6
queryLength: 6
subjectLength: 6
  queryHits subjectHits 
   <integer>   <integer> 
 1         1           1 
 2         2           1 
 3         3           1 
 4         4           1 
 5         5           2 
 6         6           2 

并且该基因1与4个转录物重叠,基因2与2个转录物重叠。

> table(subjectHits(olaps))

1 2 
4 2 

另见publication。使用更大的数据集:

tx <- with(transcriptcoords, GRanges(V1, IRanges(V2, V3, names=rownames(tx0))))
gene <- with(genecoords, GRanges(V1, IRanges(V2, V3, names=rownames(gene0))))

有一些时间:

system.time(gene <- resize(gene, width=width(gene) + 20, fix="center"))
##   user  system elapsed 
##  0.056   0.000   0.057 
system.time(findOverlaps(tx, gene, type="within"))
##   user  system elapsed 
##  2.248   0.000   2.250 

我认为现在大约是来自@ danas.zuokos的data.table解决方案的时间只有1000份成绩单

system.time({
    dt <- genecoords[transcriptcoords, allow.cartesian = TRUE]; 
    res <- dt[start <= start.1 + tol & end >= end.1 - tol, 
         list(gene = gene[which.min(size)]), by = transcript]
})
##    user  system elapsed 
##   2.148   0.244   2.400 

答案 1 :(得分:1)

哈!马丁用更好的答案打败了我。在一个完善的库中使用经过测试的代码而不是自己编写代码几乎总是更好。绝对使用Martin的解决方案,而不是这个。

但是,只是为了笑,这是另一种方法。

首先,编写一些基因和成绩单:

gs = 1:10*500
genes = data.frame(start=gs, end=gs+400)
rownames(genes) = sprintf('g%05d', 1:nrow(genes))

ts = sample(1:max(genes$end), size=10)
transcripts = data.frame(start=ts, end=ts+60)
rownames(transcripts) = sprintf('t%05d', 1:nrow(transcripts))

我们可以使用 outer 对比较进行矢量化,该函数将函数应用于其两个矢量参数的每个组合。

overlaps = function(genes, transcripts, min_overlap=1) {
  b1 = outer(genes$end, transcripts$start, min_overlap=min_overlap, 
             function(e,s,min_overlap) e-s+1>min_overlap)
  b2 = outer(genes$start, transcripts$end, min_overlap=min_overlap,
             function(s,e,min_overlap) e-s+1>min_overlap)
  result = b1 & b2
  rownames(result) = rownames(genes)
  colnames(result) = rownames(transcripts)
  return(result)
}

对于我们的基因和成绩单,我们可能会得到类似的结果:

> genes
       start  end
g00001   500  900
g00002  1000 1400
g00003  1500 1900
g00004  2000 2400
g00005  2500 2900
g00006  3000 3400
g00007  3500 3900
g00008  4000 4400
g00009  4500 4900
g00010  5000 5400

> transcripts
       start  end
t00001   535  595
t00002  2502 2562
t00003  4757 4817
t00004  3570 3630
t00005  3094 3154
t00006  1645 1705
t00007  5202 5262
t00008    13   73
t00009   788  848
t00010  4047 4107

o1 = overlaps(genes, transcripts, 1)

结果是一个布尔矩阵,告诉您每个转录本是否与每个基因重叠。

> o1
       t00001 t00002 t00003 t00004 t00005 t00006 t00007 t00008 t00009 t00010
g00001   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE   TRUE  FALSE
g00002  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00003  FALSE  FALSE  FALSE  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE
g00004  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00005  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00006  FALSE  FALSE  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE
g00007  FALSE  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00008  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE   TRUE
g00009  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE
g00010  FALSE  FALSE  FALSE  FALSE  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE

答案 2 :(得分:1)

我正在使用data.table库。

rm(list = ls())
load(url("http://www.giorgilab.org/stuff/data.rda"))
library(data.table)
tol <- 10 # tolerance
id <- 1:2000 # you can comment this out, but make sure you have big RAM

转换为data.table格式。另外计算尺寸(我不确定你为什么选择abs,并不总是比开始大?)。

genecoords <- data.table(genecoords, keep.rownames = TRUE)
setnames(genecoords, c("gene", "chr", "start", "end"))
genecoords[, size := end - start]
transcriptcoords <- data.table(transcriptcoords, keep.rownames = TRUE)[id] # comment out [id] when running on all trascripts
setnames(transcriptcoords, c("transcript", "chr", "start", "end"))

这就是结果。

setkeyv(genecoords, "chr")
setkeyv(transcriptcoords, "chr")
dt <- genecoords[transcriptcoords, allow.cartesian = TRUE]
res <- dt[start <= start.1 + tol & end >= end.1 - tol, list(gene = gene[which.min(size)]), by = transcript]

意识到这不包括不符合条件的成绩单(代码中为g<-t)。