我有两个数据文件如下:
head(RNA)
Gene_ID chr start end
1 ENSG00000000003.1 X 99883667 99884983
2 ENSG00000000003.2 X 99885756 99885863
3 ENSG00000000003.3 X 99887482 99887565
4 ENSG00000000003.4 X 99888402 99888536
5 ENSG00000000003.5 X 99888928 99889026
6 ENSG00000000003.6 X 99890175 99890249
head(SNP)
chr start end SNP_No
1 1 58812 58812 SNP_1
2 1 67230 67230 SNP_2
3 1 79529 79529 SNP_3
4 1 79595 79595 SNP_4
5 1 85665 85665 SNP_5
6 1 86064 86064 SNP_6
我想找出SNP文件和RNA文件之间的重叠区间,所以我使用了GenomicRanges R包,并运行了以下命令:
# Build genomic ranges for the RNA features and for the SNPs. Each range is
# named by its identifier so that overlap hits can be mapped back to IDs.
gr_RNA <- GRanges(
  seqnames = RNA$chr,
  ranges = IRanges(start = RNA$start, end = RNA$end, names = RNA$Gene_ID)
)
gr_SNP <- GRanges(
  seqnames = SNP$chr,
  ranges = IRanges(start = SNP$start, end = SNP$end, names = SNP$SNP_No)
)

# Pairwise hits: the query index refers to gr_RNA, the subject index to gr_SNP.
overlaps <- findOverlaps(gr_RNA, gr_SNP)

# The RNA ranges selected by subsetByOverlaps() against the SNP ranges.
subsetByOver <- subsetByOverlaps(gr_RNA, gr_SNP)

# One row per (RNA feature, SNP) hit, reported by name.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
match_hit <- data.frame(
  Gene_ID = names(gr_RNA)[queryHits(overlaps)],
  SNP = names(gr_SNP)[subjectHits(overlaps)],
  stringsAsFactors = FALSE
)
head(match_hit)
Gene_ID SNP
1 ENSG00000000457.1 SNP_307301
2 ENSG00000000457.2 SNP_307307
3 ENSG00000000457.11 SNP_307365
4 ENSG00000000457.12 SNP_307387
5 ENSG00000000460.1 SNP_306845
6 ENSG00000000460.1 SNP_306846
dim(match_hit)
[1] 12287 2
然后我把RNA文件中每个区间的起始和结束位置各向外扩展了100("start - 100" 和 "end + 100"),并再次运行脚本,如下所示:
# Number of bases by which every RNA range is widened on each side.
flank <- 100

# RNA ranges widened by `flank` on both ends, still named by Gene_ID.
gr_RNA1 <- GRanges(
  seqnames = RNA$chr,
  ranges = IRanges(
    start = RNA$start - flank,
    end = RNA$end + flank,
    names = RNA$Gene_ID
  )
)

# Pairwise hits: the query index refers to gr_RNA1, the subject index to gr_SNP.
overlaps <- findOverlaps(gr_RNA1, gr_SNP)

# The widened RNA ranges selected by subsetByOverlaps() against the SNP ranges.
subsetByOver <- subsetByOverlaps(gr_RNA1, gr_SNP)

# One row per (RNA feature, SNP) hit. The columns are named explicitly so the
# table does not carry auto-generated names, and `FALSE` is spelled out because
# `F` is an ordinary variable that can be reassigned.
match_hit1 <- data.frame(
  Gene_ID = names(gr_RNA1)[queryHits(overlaps)],
  SNP = names(gr_SNP)[subjectHits(overlaps)],
  stringsAsFactors = FALSE
)
dim(match_hit1)
[1] 17976 2
现在,我想实现一个函数:它以RNA表、SNP表和扩展距离作为参数,并返回最终的重叠结果。