Question

我有两个矩阵，例如

我想得到它们之间的交集（即区域重叠的部分）

 > head(small[1:3,])
  Chrom1    Start1      End1 Strand1 Chrom2    Start2      End2
1      1  28677074  28677079       +      1  28706324  28706329
2      1 186383731 186383738       +      1 186383731 186383732
3      1 245902589 245902590       +      1 246007384 246007385
  Strand2
1       -
2       -
3       -
> 
    > dim(small)
[1] 1594    8

https://www.dropbox.com/s/mkplrg1236f8qtd/111.txt?dl=0

还有一个大矩阵

> head(big[1:3,])
  Ensembl_Gene_ID Chromosome_Name Gene_Start Gene_End
1 ENSG00000233440              13   23708313 23708703
2 ENSG00000207157              13   23726725 23726825
3 ENSG00000229483              13   23743974 23744736
  Associated_Gene_Name X5UTR_Start X5UTR_End X3UTR_Start X3UTR_End
1              HMGA1P6          NA        NA          NA        NA
2               RNY3P4          NA        NA          NA        NA
3            LINC00362          NA        NA          NA        NA
  Transcript_count Gene_Biotype Exon_Chr_Start Exon_Chr_End
1                1   pseudogene       23708313     23708703
2                1     misc_RNA       23726725     23726825
3                1      lincRNA       23744691     23744736
  Exon_Rank_in_Transcript Ensembl_Transcript_ID Transcript_Start
1                       1       ENST00000418454         23708313
2                       1       ENST00000384428         23726725
3                       1       ENST00000414345         23743974
  Transcript_End strand
1       23708703      1
2       23726825     -1
3       23744736     -1
> dim(big)
[1] 1048575      18
>

https://www.dropbox.com/s/bit4iw2ne19td63/big.txt?dl=0

我需要这样的东西

Chrom1  Start1  End1    Strand1 Chrom2  Start2  End2    Strand2 GeneName.node1  GeneName.node2
chr1    14480603    14481217    +   chr1    14747789    14748719    -   -   -
chr1    16169956    16170596    +   chr1    16217823    16218463    -   RP11-169K16.9   SPEN

我有一个类似下面这样的 R 脚本

#### Determining breakpoint locations: and adding to table
# The breakpoint of a node is its End coordinate when the node is on the "+"
# strand and its Start coordinate otherwise. The selection is done on whole
# columns: a row-wise apply() would first coerce the data frame to a
# character matrix and then need an as.numeric() round trip per coordinate.
# NOTE(review): the first two assignments write to `small`, while the last
# two read from and write to `svinfo`; presumably both names are meant to
# refer to the same table -- confirm before running.

small$breakpoint1 <- ifelse(as.character(small$Strand1) == "+",
                            as.numeric(small$End1),
                            as.numeric(small$Start1))
small$breakpoint2 <- ifelse(as.character(small$Strand2) == "+",
                            as.numeric(small$End2),
                            as.numeric(small$Start2))
# Per row, the smaller and the larger of the two breakpoints.
svinfo$breakpoint1.ordered <- pmin(as.numeric(svinfo$breakpoint1),
                                   as.numeric(svinfo$breakpoint2))
svinfo$breakpoint2.ordered <- pmax(as.numeric(svinfo$breakpoint1),
                                   as.numeric(svinfo$breakpoint2))

  #######
  ### Start SV annotation:
  # Gene-level ranges: one range per row of `hg19`, from Gene_Start to
  # Gene_End on Chromosome_Name. The remaining named arguments are passed
  # through to GRanges() and become per-range metadata columns.
  # NOTE(review): `hg19` is not defined in this snippet; presumably it is the
  # annotation table shown earlier as `big` -- confirm. That table's header
  # shows a lowercase `strand` column, whereas this code reads `hg19$Strand`
  # -- verify the column name.
  gr.hg19.gene <-  GRanges(
    seqnames = Rle(hg19$Chromosome_Name),
    ranges = IRanges(start=as.numeric(hg19$Gene_Start), end = as.numeric(hg19$Gene_End)),
    EnsemblGeneID = hg19$Ensembl_Gene_ID,
    GeneName = hg19$Associated_Gene_Name,
    TranscriptCount = hg19$Transcript_count,
    ExonChrStart = hg19$Exon_Chr_Start,
    ExonChrEnd = hg19$Exon_Chr_End,
    ExonRankInTranscript = hg19$Exon_Rank_in_Transcript,
    EnsemblTranscriptID = hg19$Ensembl_Transcript_ID,
    TranscriptStart = hg19$Transcript_Start,
    TranscriptEnd = hg19$Transcript_End,
    GeneStrand = hg19$Strand,
    # Exon identifier built as "<exon start>.<exon end>".
    ExonID = paste(hg19$Exon_Chr_Start,hg19$Exon_Chr_End,sep=".")
  )

  # First breakpoint of every row of `svinfo`, as a single-position range
  # (start and end are both breakpoint1) on Chrom1.
  # NOTE(review): `svinfo$Type` and `svinfo$ID` are not among the columns
  # shown for `small` -- confirm they exist in the real table.
  gr.svinfo.node1 <- GRanges(
    seqnames = Rle(svinfo$Chrom1),
    ranges = IRanges(start=as.numeric(svinfo$breakpoint1), end = as.numeric(svinfo$breakpoint1)),
    Type = svinfo$Type,
    Node1.Strand = svinfo$Strand1,
    ID = svinfo$ID
  )
  # Second breakpoint of every row of `svinfo`, built the same way from
  # Chrom2 / breakpoint2 / Strand2.
  gr.svinfo.node2 <- GRanges(
    seqnames = Rle(svinfo$Chrom2),
    ranges = IRanges(start=as.numeric(svinfo$breakpoint2), end = as.numeric(svinfo$breakpoint2)),
    Type = svinfo$Type,
    Node2.Strand = svinfo$Strand2,
    ID = svinfo$ID
  )

但是我不知道如何获得与小矩阵各个部分相关的基因

有人可以帮助我吗？

Answer 1

以下应作为第一步。首先加载您的（修改后的）数据示例：

# Toy query regions. The header names three columns while each data row has
# four fields, so read.table() takes the first field as the row names.
small <- read.table(
  header = TRUE,
  text = "Chromosome chromStart  chromEnd
1          1   28677074  28677079
2          1  186383731 186383738
3          1  245902589 245902590
4          2   56345643  56345645
5          3   59766214  59766217
6          3   60270545  60270548"
)

# Toy gene annotation, laid out the same way (leading blank line is skipped,
# first field of each data row becomes the row name).
big <- read.table(
  header = TRUE,
  text = "
Chromosome chromStart chromEnd      Gene
1         1   28677075 28677078   HMGA1P6
2         13   23726725 23726825    RNY3P4
3         13   23743974 23744736 LINC00362
4         13   23743974 23744736 LINC00362
5         13   23791571 23791673  RNU6-58P
6         13   23817659 23821323  TATDN2P3"
)

接下来是用于找出与每个区域相对应的基因的代码。

# For each region (row) of `small`, gather the genes of `big` that lie on the
# same chromosome and fall entirely inside the region, joined with ";".
# A region without any such gene gets the empty string.
small$Gene <- vapply(
  seq_len(nrow(small)),
  function(row) {
    # Vectorised comparison of this one region against every gene in `big`
    inside <- big$Chromosome == small[row, "Chromosome"] &
      big$chromStart >= small[row, "chromStart"] &
      big$chromEnd <= small[row, "chromEnd"]

    # which() keeps only the TRUE positions; collapse handles several hits
    paste(big$Gene[which(inside)], collapse = ";")
  },
  character(1)
)

print(small)
#  Chromosome chromStart  chromEnd    Gene
#1          1   28677074  28677079 HMGA1P6
#2          1  186383731 186383738        
#3          1  245902589 245902590        
#4          2   56345643  56345645        
#5          3   59766214  59766217        
#6          3   60270545  60270548

当然，使用for循环可能不是最佳选择。但是请注意，我们循环了small矩阵，并通过比较small中的每一行与big中的所有行来利用矢量化。即使在完整数据上也应该很快。

由于您需要将每个基因与染色体上每个区域进行比较，因此您可以为提高速度而优化代码。

我已经“解释”了一个以上的基因可能落在small记录所定义的区域中的可能性。

编辑：

如果您只是在寻找染色体区域和基因的“重叠”，则需要把上面循环中的 j 改成像下面这样定义：

# Indices of the genes in `big` that overlap region `i` of `small` on the
# same chromosome. A gene overlaps the region when it starts inside it, ends
# inside it, or covers it completely; the last case is needed because a gene
# longer than the region has neither of its ends inside the region.
j <- which(
  big$Chromosome == small[i, "Chromosome"] & (
    (small[i, "chromStart"] <= big$chromStart & big$chromStart <= small[i, "chromEnd"]) | # Gene starts in region
    (small[i, "chromStart"] <= big$chromEnd & big$chromEnd <= small[i, "chromEnd"]) |     # Gene ends in region
    (big$chromStart <= small[i, "chromStart"] & small[i, "chromEnd"] <= big$chromEnd)     # Gene spans the whole region
  )
)

如果我没弄错的话，这基本上就是检查基因是否在该区域内开始或结束。

Answer 2

好像来自data.table的foverlaps很有用。查看此答案：

Finding Overlaps between interval sets / Efficient Overlap Joins

下面使用 @Anders Ellern Bilgrau 的数据集，用 foverlaps 来实现：

library(data.table)

# Convert both data frames to data.tables by reference.
setDT(small)
setDT(big)

# foverlaps() needs the lookup table keyed, with the interval start/end as
# the last two key columns.
interval_cols <- c("Chromosome", "chromStart", "chromEnd")
setkeyv(big, interval_cols)

# Overlap join of the regions in `small` against the genes in `big`;
# regions without an overlapping gene are kept with NA in the gene columns.
foverlaps(small, big, by.x = interval_cols, by.y = key(big), type = "any")

#   Chromosome chromStart chromEnd    Gene i.chromStart i.chromEnd
#1:          1   28677075 28677078 HMGA1P6     28677074   28677079
#2:          1         NA       NA    <NA>    186383731  186383738
#3:          1         NA       NA    <NA>    245902589  245902590
#4:          2         NA       NA    <NA>     56345643   56345645
#5:          3         NA       NA    <NA>     59766214   59766217
#6:          3         NA       NA    <NA>     60270545   60270548

合并大小不同的两个矩阵

2 个答案: