我正在尝试使用 R 中的 phyper 函数进行富集分析(enrichment analysis)。我写的代码能给出正确的结果,但当矩阵规模增大时,运行时间会变得非常长。下面是一个 621 × 1860 矩阵的可重现示例。但当矩阵增大到 6210 × 24000 时,即使在多个核心上并行运行,也需要将近一天才能完成。我想知道是否有办法优化这段代码。
请通过代码注释中给出的共享链接下载三个 R 对象(.rds 文件)。
## Main Functions
#' Score one drug/disease against every annotation via a hypergeometric test
#'
#' For each annotation with a positive entry in the drug/disease's row of
#' `DrugDiseaseFeatureMatrix`, computes an upper-tail hypergeometric p-value
#' with `phyper()`, adjusts the p-values with Benjamini-Hochberg, and reports
#' `-log2(adjusted p)` for annotations whose adjusted p-value is <= 0.05.
#'
#' @param DrugDiseaseName Row name of the drug/disease in
#'   `DrugDiseaseFeatureMatrix`; also forwarded to `GetGeneList()`.
#' @param DrugDiseaseGeneMatrix Matrix with one column per annotation (column
#'   names required). Its row count is used as the total number of genes and
#'   its column sums as the number of genes per annotation.
#'   NOTE(review): presumably a 0/1 gene-by-annotation membership matrix --
#'   confirm against the data.
#' @param DrugDiseaseFeatureMatrix Matrix with one row per drug/disease and one
#'   named column per annotation. The row entry is used as the observed overlap
#'   count `q + 1` in `phyper()`.
#' @param DFDrgDis Data frame forwarded unchanged to `GetGeneList()`.
#' @return A named numeric vector with one entry per column of
#'   `DrugDiseaseGeneMatrix`: `-log2(adjusted p)` for enriched annotations and
#'   0 for all others.
GetEnrichedAnnotations <- function(DrugDiseaseName,
                                   DrugDiseaseGeneMatrix,
                                   DrugDiseaseFeatureMatrix,
                                   DFDrgDis) {
  significance_cutoff <- 0.05
  # Adjusted p-values that underflow to exactly 0 are replaced by this floor
  # so that the log transform stays finite.
  zero_p_floor <- 2.2e-16

  # Result template: every annotation starts at 0 ("not enriched").
  TotalAnnotaionsVector <- rep(0, ncol(DrugDiseaseGeneMatrix))
  names(TotalAnnotaionsVector) <- colnames(DrugDiseaseGeneMatrix)

  TotalGenesCount <- nrow(DrugDiseaseGeneMatrix)
  DrugDiseaseGenes <- GetGeneList(DrugDiseaseName, DFDrgDis)

  # Keep only annotations with a positive entry for this drug/disease.
  # which() also discards NA comparisons. Taking the values from the already
  # extracted row (instead of re-indexing the matrix) keeps the names even
  # when a single annotation is left.
  FeatureRow <- DrugDiseaseFeatureMatrix[DrugDiseaseName, ]
  GenesFromInput <- FeatureRow[which(FeatureRow > 0)]
  if (length(GenesFromInput) == 0L) {
    return(TotalAnnotaionsVector)
  }
  TestedAnnotations <- names(GenesFromInput)

  # Genes per tested annotation. drop = FALSE keeps a one-column selection
  # two-dimensional (the previous apply() call failed on that case).
  # as.matrix() mirrors the coercion apply() performed, so data frames and
  # sparse Matrix objects are still accepted; colSums() replaces the
  # per-column R-level loop.
  GenesinAnnotation <- colSums(as.matrix(
    DrugDiseaseGeneMatrix[, TestedAnnotations, drop = FALSE]
  ))
  RemainingGenes <- TotalGenesCount - GenesinAnnotation

  # P(X >= observed): hence q = observed - 1 with lower.tail = FALSE.
  # The scalar number of draws is recycled by phyper().
  PValues <- phyper(
    GenesFromInput - 1,
    GenesinAnnotation,
    RemainingGenes,
    length(DrugDiseaseGenes),
    lower.tail = FALSE
  )
  names(PValues) <- TestedAnnotations

  AdjustedPvalues <- p.adjust(PValues, method = "BH")

  # which() drops NA/NaN p-values instead of propagating NA names.
  EnrichedAnnotations <- AdjustedPvalues[
    which(AdjustedPvalues <= significance_cutoff)
  ]
  EnrichedAnnotations[EnrichedAnnotations == 0] <- zero_p_floor

  TotalAnnotaionsVector[names(EnrichedAnnotations)] <- -log2(EnrichedAnnotations)
  TotalAnnotaionsVector
}
#' Get the unique gene symbols associated with one drug/disease
#'
#' Selects the rows of `DFDrgDis` whose `DrugName` equals `DiseaseName`,
#' splits their comma-separated `Symbol` strings, trims surrounding
#' whitespace and returns the distinct, non-empty symbols.
#'
#' @param DiseaseName Name to look up in the `DrugName` column.
#' @param DFDrgDis Data frame with a `DrugName` column and a `Symbol` column
#'   holding comma-separated gene symbols.
#' @return A character vector of unique gene symbols; `character(0)` when no
#'   row matches.
GetGeneList <- function(DiseaseName, DFDrgDis) {
  # which() ignores NA comparisons, so rows with a missing DrugName are not
  # selected as all-NA rows.
  matched_rows <- which(DFDrgDis[["DrugName"]] == DiseaseName)

  # as.character() lets factor columns through; strsplit() rejects factors.
  symbol_strings <- as.character(DFDrgDis[["Symbol"]][matched_rows])

  genes <- unlist(strsplit(symbol_strings, ",", fixed = TRUE), use.names = FALSE)
  genes <- trimws(as.character(genes))

  # Empty tokens (e.g. from "A,,B") and NA are not genes; counting them would
  # inflate the gene-list size used by the caller.
  genes <- genes[!is.na(genes) & nzchar(genes)]
  unique(genes)
}
## Parallel driver: compute enriched annotations for every disease ----
library(foreach)

## Please download the three objects for the reproducible example from
## https://drive.google.com/drive/folders/0Bz9Y4BgZAF7oS2dtVVEwN0Z1Tnc?usp=sharing
## Inputs are loaded before the cluster is created so that a missing file
## does not leave worker processes behind.
GeneAnnotations <- readRDS("Desktop/StackOverFlow/GeneAnnotations.rds")
DiseaseAnnotations <- readRDS("Desktop/StackOverFlow/DiseaseAnnotations.rds")
Diseases <- readRDS("Desktop/StackOverFlow/Diseases.rds")

## One row of DiseaseAnnotations per disease
DisNames <- row.names(DiseaseAnnotations)

## Use all cores but one. detectCores() can return NA or 1, so clamp to at
## least one worker; makeCluster() cannot start with 0 or NA workers.
numberofCores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)

## NOTE(review): closeAllConnections() closes every user connection in the
## session (and resets sinks), not only stale cluster sockets. Kept to
## preserve the script's existing behaviour.
closeAllConnections()

Cluster <- parallel::makeCluster(numberofCores)
doParallel::registerDoParallel(Cluster)

## One task per disease. GetGeneList is exported explicitly because it is only
## referenced from inside GetEnrichedAnnotations. The cluster is stopped even
## if a worker raises an error.
EnrichedAnnotations <- tryCatch(
  foreach(
    i = seq_along(DisNames),
    .export = c("GetGeneList"),
    .packages = "Matrix"
  ) %dopar% {
    GetEnrichedAnnotations(DisNames[i], GeneAnnotations, DiseaseAnnotations, Diseases)
  },
  finally = parallel::stopCluster(Cluster)
)

## Stack the per-disease vectors into a disease-by-annotation matrix.
## Column names are inherited from the names of the per-disease vectors.
EnrichedAnnotations <- do.call("rbind", EnrichedAnnotations)
rownames(EnrichedAnnotations) <- DisNames
答案 0 :(得分:0)
如果你对代码的串行(非并行)版本做一次简单的性能分析(profiling),就会发现几乎所有时间都花在下面这两行上:
GenesinAnnotation = DrugDiseaseGeneMatrix[,names(UPDNAnnotations)]
GenesinAnnotation = apply(GenesinAnnotation,2,sum)
您可以预先计算一次 colSums(这样就不会对同一矩阵重复计算同样的结果),而且实际上不需要先对数据取子集(对计算结果取子集即可)。
因此,您只需要把这两行替换为:
GenesinAnnotation <- GenesinAnnotation0[names(UPDNAnnotations)]
其中 GenesinAnnotation0 需要在循环之外预先计算一次:
GenesinAnnotation0 <- colSums(DrugDiseaseGeneMatrix)