我正在尝试将来自多个数据框的信息组合起来填充一个大数据框。第一个数据框就像是对所有人类基因的概述:
gene_id chromosome gene_start gene_end
ENSG00000116396 1 110753965 110825722
ENSG00000228217 1 118320709 118321128
ENSG00000261716 1 149816065 149820591
ENSG00000223562 1 211355498 211356446
ENSG00000239859 1 36171626 36171875
ENSG00000197921 1 2460184 2461684
ENSG00000232237 1 201083081 201096312
ENSG00000212257 1 65488651 65488757
ENSG00000158887 1 161274525 161279762
ENSG00000238122 1 108803818 108816311
ENSG00000215846 1 159246293 159247282
ENSG00000266763 1 26238240 26238313
ENSG00000228634 1 32398621 32399576
ENSG00000177614 1 230457392 230561475
ENSG00000163462 1 155145873 155157447
ENSG00000204481 1 13668269 13673511
其次,我有许多文件(其中516个),包含如下数据:
Sample Chromosome Start End Num_Probes Segment_Mean
UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 61735 82170 9 0.2560
UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 82315 16869363 8678 -0.1199
UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 16871278 17087292 85 -0.5386
UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 17089349 17209603 23 -0.0807
UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 17210652 17262232 57 0.2680
UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 17262247 25583341 5240 -0.1228
UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 25593128 25646986 28 -1.8216
UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 25661501 30738534 2398 -0.0942
UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 30739299 30745210 7 -1.3117
现在,我想创建某种循环,将每个基因放在第一个数据框中。然后我想在检查基因位置的同时遍历所有516个数据帧。
因此,对于每个基因和每个文件,我想比较基因的开始和结束与每个样本片段的开始和结束,前提是它们位于同一染色体上。如果是这种情况,我想采用段均值并将其放在一个新的大数据帧中,其中gene_id为rownames,文件名为列名。
这是我已有的代码:
for(gene in 1:100){
gene_id <- genome[gene, 1]
chromosome <- genome[gene, 2]
gene_start <- genome[gene, 3]
gene_end <- genome[gene, 4]
for(name in 1:length(dataframe_names)){
df <- get(dataframe_names[name])
for(segment in 1:nrow(df)){
if(chromosome == as.character(df[segment,2])){
if(gene_start > df[segment,3] && gene_start < df[segment,4] && gene_end > df[segment,3] && gene_end < df[segment,4]){
data_matrix[gene,name] <- df[segment, 6]
}
}
}
}
}
这段代码有效,但考虑到有57 773个基因,它真的很慢。用100个基因进行的测试耗时1小时,因此完整运行可能需要2-3周......
我认为使用apply
- 家庭会加快速度,但我以前从未使用它们,所以我真的不知道怎么做。互联网上的示例总是使用sum
或mean
之类的东西,但我不想要这些东西,我只想比较这些数字。另外,我不知道哪个apply
家族最适合我的需求。
你们想帮助我或走上正轨吗?
答案 0 :(得分:3)
你需要做的第一件事就是停止重新发明轮子。熟悉bioconductor包IRanges
,GenomicRanges
(以及Biostrings
的完整性,但不是针对此特定问题)。获得GenomicRanges后,请查看findOverlaps系列函数。
由于您的exmple数据实际上没有任何重叠,我修改它就像这样
df1 <- structure(list(gene_id = structure(c(2L, 5L, 1L, 3L, 7L, 6L,
4L), .Label = c("ENSG00000207157", "ENSG00000223116", "ENSG00000229483",
"ENSG00000232849", "ENSG00000233440", "ENSG00000235205", "ENSG00000252952"
), class = "factor"), chromosome = c(13L, 13L, 13L, 13L, 13L,
13L, 13L), gene_start = c(23551994L, 23708313L, 23726725L, 23743974L,
23791571L, 23817659L, 93708910L), gene_end = c(23552136L, 23708703L,
23726825L, 23744736L, 23791673L, 23821323L, 93710179L)), .Names = c("gene_id",
"chromosome", "gene_start", "gene_end"), class = "data.frame", row.names = c(NA,
-7L))
df2 <- structure(list(Sample = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930", class = "factor"),
Chromosome = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Start = c(61735L,
82315L, 16871278L, 17089349L, 17210652L, 17262247L, 25593128L,
25661501L, 30739299L), End = c(82170L, 16869363L, 17087292L,
17209603L, 17262232L, 25583341L, 25646986L, 30738534L, 30745210L
), Num_Probes = c(9L, 8678L, 85L, 23L, 57L, 5240L, 28L, 2398L,
7L), Segment_Mean = c(0.256, -0.1199, -0.5386, -0.0807, 0.268,
-0.1228, -1.8216, -0.0942, -1.3117)), .Names = c("Sample",
"Chromosome", "Start", "End", "Num_Probes", "Segment_Mean"), class = "data.frame", row.names = c(NA,
-9L))
df1$chromosome <- 1
df1[6,4] <- 30745215 # to show what happens when there are multiple overlaps
现在获取重叠
library(GenomicRanges)
Genes <- GRanges(df1$chromosome,
IRanges(df1$gene_start, df1$gene_end), genes=df1$gene_id)
# make a list of the 512 others, and read 1 one for example
files <- list.files(pattern="csv") # assuming they are .csv files
snps0 <- read.csv(files[[1]])
snps <- GRanges(snps0$Chromosome, IRanges(snps0$Start, snps0$End),
Segment_Mean=snps0$Segment_Mean)
olaps <- findOverlaps(query=snps, subject=Genes)
一旦有了重叠,就可以使用它作为合并原始数据框的基础
olaps2 <- as.data.frame(olaps)
df1$Row <- rownames(df1)
NewDF <- merge(df1,olaps2,by.x="Row",by.y="subjectHits",all=T,sort=F)
df2$Row <- rownames(df2)
NewDF2 <- merge(NewDF,df2,by.x="queryHits",by.y="Row",all.x=T,sort=F)[,c(-1,-2)]
# drop the first 2 columns because they were just temporary for merging purposes
head(NewDF2,2)
gene_id chromosome gene_start gene_end
1 ENSG00000223116 1 23551994 23552136
2 ENSG00000207157 1 23726725 23726825
Sample Chromosome Start
1 UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 17262247
2 UNDID_p_TCGA_353_354_355_37_NSP_GenomeWideSNP_6_H10_1376930 1 17262247
End Num_Probes Segment_Mean
1 25583341 5240 -0.1228
2 25583341 5240 -0.1228