我有两个矩阵,例如
我想得到它们之间的交集(即坐标区间的重叠)
> head(small[1:3,])
Chrom1 Start1 End1 Strand1 Chrom2 Start2 End2
1 1 28677074 28677079 + 1 28706324 28706329
2 1 186383731 186383738 + 1 186383731 186383732
3 1 245902589 245902590 + 1 246007384 246007385
Strand2
1 -
2 -
3 -
>
> dim(small)
[1] 1594 8
https://www.dropbox.com/s/mkplrg1236f8qtd/111.txt?dl=0
还有一个大矩阵
> head(big[1:3,])
Ensembl_Gene_ID Chromosome_Name Gene_Start Gene_End
1 ENSG00000233440 13 23708313 23708703
2 ENSG00000207157 13 23726725 23726825
3 ENSG00000229483 13 23743974 23744736
Associated_Gene_Name X5UTR_Start X5UTR_End X3UTR_Start X3UTR_End
1 HMGA1P6 NA NA NA NA
2 RNY3P4 NA NA NA NA
3 LINC00362 NA NA NA NA
Transcript_count Gene_Biotype Exon_Chr_Start Exon_Chr_End
1 1 pseudogene 23708313 23708703
2 1 misc_RNA 23726725 23726825
3 1 lincRNA 23744691 23744736
Exon_Rank_in_Transcript Ensembl_Transcript_ID Transcript_Start
1 1 ENST00000418454 23708313
2 1 ENST00000384428 23726725
3 1 ENST00000414345 23743974
Transcript_End strand
1 23708703 1
2 23726825 -1
3 23744736 -1
> dim(big)
[1] 1048575 18
>
https://www.dropbox.com/s/bit4iw2ne19td63/big.txt?dl=0
我需要这样的东西
Chrom1 Start1 End1 Strand1 Chrom2 Start2 End2 Strand2 GeneName.node1 GeneName.node2
chr1 14480603 14481217 + chr1 14747789 14748719 - - -
chr1 16169956 16170596 + chr1 16217823 16218463 - RP11-169K16.9 SPEN
我有一个类似下面这样的R脚本
#### Determine the breakpoint of each node and add it to the table.
# A "+" strand node breaks at its End coordinate; any other strand breaks at
# its Start coordinate. The comparison is vectorised over whole columns
# instead of apply()-ing over rows, because apply() first coerces every row of
# a mixed-type data frame to character.
# as.numeric(as.character(.)) keeps the conversion correct for numeric,
# character and factor columns alike.
small$breakpoint1 <- ifelse(
  small$Strand1 == "+",
  as.numeric(as.character(small$End1)),
  as.numeric(as.character(small$Start1))
)
small$breakpoint2 <- ifelse(
  small$Strand2 == "+",
  as.numeric(as.character(small$End2)),
  as.numeric(as.character(small$Start2))
)
# NOTE(review): the breakpoints above are stored on `small`, while the ordered
# breakpoints below are read from and written to `svinfo` -- confirm that both
# names are meant to refer to the same table.
# Element-wise minimum / maximum of the two breakpoints of each row.
svinfo$breakpoint1.ordered <- pmin(
  as.numeric(as.character(svinfo$breakpoint1)),
  as.numeric(as.character(svinfo$breakpoint2))
)
svinfo$breakpoint2.ordered <- pmax(
  as.numeric(as.character(svinfo$breakpoint1)),
  as.numeric(as.character(svinfo$breakpoint2))
)
#######
### Start SV annotation:
# Gene annotation as genomic ranges: one range per row of `hg19`, running from
# Gene_Start to Gene_End on Chromosome_Name. Every other named argument is
# attached to the ranges as a per-row annotation column.
# No `strand =` argument is passed; the gene strand is only carried along as
# the GeneStrand annotation column.
# NOTE(review): this reads `hg19$Strand`, whereas the `big` table shown earlier
# names that column `strand` (lower case) -- confirm the column name in `hg19`.
gr.hg19.gene <- local({
  gene_ranges <- IRanges(
    start = as.numeric(hg19$Gene_Start),
    end = as.numeric(hg19$Gene_End)
  )
  # Exon identifier of the form "<exon start>.<exon end>".
  exon_ids <- paste(hg19$Exon_Chr_Start, hg19$Exon_Chr_End, sep = ".")
  GRanges(
    seqnames = Rle(hg19$Chromosome_Name),
    ranges = gene_ranges,
    EnsemblGeneID = hg19$Ensembl_Gene_ID,
    GeneName = hg19$Associated_Gene_Name,
    TranscriptCount = hg19$Transcript_count,
    ExonChrStart = hg19$Exon_Chr_Start,
    ExonChrEnd = hg19$Exon_Chr_End,
    ExonRankInTranscript = hg19$Exon_Rank_in_Transcript,
    EnsemblTranscriptID = hg19$Ensembl_Transcript_ID,
    TranscriptStart = hg19$Transcript_Start,
    TranscriptEnd = hg19$Transcript_End,
    GeneStrand = hg19$Strand,
    ExonID = exon_ids
  )
})
# Node 1 of every row of `svinfo` as a range whose start and end are both the
# breakpoint1 position on Chrom1; Type, Strand1 and ID are attached as
# per-row annotation columns.
gr.svinfo.node1 <- local({
  node1_pos <- as.numeric(svinfo$breakpoint1)
  GRanges(
    seqnames = Rle(svinfo$Chrom1),
    ranges = IRanges(start = node1_pos, end = node1_pos),
    Type = svinfo$Type,
    Node1.Strand = svinfo$Strand1,
    ID = svinfo$ID
  )
})
# Same construction for node 2, using Chrom2, breakpoint2 and Strand2.
gr.svinfo.node2 <- local({
  node2_pos <- as.numeric(svinfo$breakpoint2)
  GRanges(
    seqnames = Rle(svinfo$Chrom2),
    ranges = IRanges(start = node2_pos, end = node2_pos),
    Type = svinfo$Type,
    Node2.Strand = svinfo$Strand2,
    ID = svinfo$ID
  )
})
但是我不知道如何获得与小矩阵各个部分相关的基因
有人可以帮助我吗?
答案 0 :(得分:2)
以下应作为第一步。首先加载您的(修改后的)数据示例:
# Toy query regions. The header line has one field fewer than the data lines,
# so the first field of every data line is used as the row name.
small <- read.table(
  header = TRUE,
  text = "Chromosome chromStart chromEnd
1 1 28677074 28677079
2 1 186383731 186383738
3 1 245902589 245902590
4 2 56345643 56345645
5 3 59766214 59766217
6 3 60270545 60270548"
)
# Toy gene annotation with the same layout plus a Gene name column.
big <- read.table(
  header = TRUE,
  text = "
Chromosome chromStart chromEnd Gene
1 1 28677075 28677078 HMGA1P6
2 13 23726725 23726825 RNY3P4
3 13 23743974 23744736 LINC00362
4 13 23743974 23744736 LINC00362
5 13 23791571 23791673 RNU6-58P
6 13 23817659 23821323 TATDN2P3"
)
接下来是用于识别与各个区域相对应的基因的代码。
# Start with an all-missing Gene column; each row is filled in below.
small$Gene <- NA_character_
for (i in seq_len(nrow(small))) {
  # Rows of `big` on the same chromosome whose gene lies entirely inside
  # region i (starts at or after the region start, ends at or before its end).
  j <- which(
    big$Chromosome == small$Chromosome[i] &
      small$chromStart[i] <= big$chromStart &
      small$chromEnd[i] >= big$chromEnd
  )
  # Join the matching gene names with ";"; no match yields an empty string.
  small$Gene[i] <- paste(big$Gene[j], collapse = ";")
}
print(small)
# Chromosome chromStart chromEnd Gene
#1 1 28677074 28677079 HMGA1P6
#2 1 186383731 186383738
#3 1 245902589 245902590
#4 2 56345643 56345645
#5 3 59766214 59766217
#6 3 60270545 60270548
当然,使用for循环可能不是最佳选择。但是请注意,我们循环了small
矩阵,并通过比较small
中的每一行与big
中的所有行来利用矢量化。即使在完整数据上也应该很快。
由于每个基因只需要与同一条染色体上的区域进行比较,因此您还可以据此优化代码以提高速度。
我已经考虑到了可能有不止一个基因落在small
记录所定义的区域中的情况:此时多个基因名会用“;”连接。
编辑:
如果您只是在寻找染色体区域和基因的“重叠”,则需要把上面循环中的j
改为如下定义:
# Rows of `big` on the same chromosome as region i of `small` whose gene
# overlaps that region in any way. Three cases are checked: the gene starts
# inside the region, ends inside it, or spans the whole region (starts before
# it and ends after it).
j <- which(
  big$Chromosome == small[i, "Chromosome"] & (
    # Gene starts in region
    (small[i, "chromStart"] <= big$chromStart &
       big$chromStart <= small[i, "chromEnd"]) |
    # Gene ends in region
    (small[i, "chromStart"] <= big$chromEnd &
       big$chromEnd <= small[i, "chromEnd"]) |
    # Gene spans the whole region
    (big$chromStart <= small[i, "chromStart"] &
       small[i, "chromEnd"] <= big$chromEnd)
  )
)
如果我没弄错的话。基本上,这应该检查基因是否在该区域内开始或结束。
答案 1 :(得分:1)
好像来自data.table的foverlaps
很有用。查看此答案:
Finding Overlaps between interval sets / Efficient Overlap Joins
使用@Anders Ellern Bilgrau的数据集,下面是用foverlaps的实现:
library(data.table)
# Convert both tables to data.table by reference.
setDT(small)
setDT(big)
# foverlaps() requires the looked-up table to be keyed, with the interval
# start and end as the last two key columns.
setkeyv(big, c("Chromosome", "chromStart", "chromEnd"))
# Overlap join spelled out with its defaults: match on the key columns of
# `big` in both tables and accept any kind of overlap.
foverlaps(small, big, by.x = key(big), by.y = key(big), type = "any")
# Chromosome chromStart chromEnd Gene i.chromStart i.chromEnd
#1: 1 28677075 28677078 HMGA1P6 28677074 28677079
#2: 1 NA NA <NA> 186383731 186383738
#3: 1 NA NA <NA> 245902589 245902590
#4: 2 NA NA <NA> 56345643 56345645
#5: 3 NA NA <NA> 59766214 59766217
#6: 3 NA NA <NA> 60270545 60270548