Question

我正在尝试使用 R 中的 phyper 函数进行富集分析（enrichment analysis）。我写的代码能给出正确的结果，但当矩阵变大时运行时间极长。下面是一个针对 621 × 1860 矩阵的可重现示例；当矩阵增大到 6210 × 24000 时，即使在多个核心上并行运行，也需要将近一天才能完成。我想知道有没有办法优化这段代码。

请通过代码注释中的共享链接下载三个 R 对象（.rds 文件）。

## Main Functions
## Hypergeometric enrichment of one drug/disease against every annotation.
##
## Arguments:
##   DrugDiseaseName          Row name to look up in DrugDiseaseFeatureMatrix;
##                            also passed to GetGeneList() to fetch the genes.
##   DrugDiseaseGeneMatrix    Matrix with one row per gene and one named column
##                            per annotation. nrow() is used as the population
##                            size and the column sums as the number of genes
##                            per annotation (presumably a 0/1 membership
##                            matrix -- verify against the data).
##   DrugDiseaseFeatureMatrix Matrix with one named row per drug/disease and
##                            one named column per annotation; the value is
##                            used as the number of input genes that hit the
##                            annotation.
##   DFDrgDis                 Data frame handed to GetGeneList().
##   GenesinAnnotation0       Optional named vector of precomputed
##                            colSums(DrugDiseaseGeneMatrix). Supplying it
##                            avoids re-summing the gene matrix on every call.
##                            NOTE(review): for sparse `Matrix` objects the
##                            Matrix package must be attached so that colSums()
##                            dispatches to its method.
##
## Returns a numeric vector named by colnames(DrugDiseaseGeneMatrix):
## -log2(BH-adjusted p-value) for annotations whose adjusted p-value is
## <= 0.05, and 0 for every other annotation.
GetEnrichedAnnotations <- function(DrugDiseaseName,
                               DrugDiseaseGeneMatrix,
                               DrugDiseaseFeatureMatrix,
                               DFDrgDis,
                               GenesinAnnotation0 = NULL) {
  ## Population size of the hypergeometric test: one row per gene.
  TotalGenesCount <- nrow(DrugDiseaseGeneMatrix)
  ## Genes associated with this drug/disease; their count is the sample size.
  DrugDiseaseGenes <- GetGeneList(DrugDiseaseName, DFDrgDis)

  ## Result template covering every annotation; 0 means "not enriched".
  TotalAnnotaionsVector <- rep(0, ncol(DrugDiseaseGeneMatrix))
  names(TotalAnnotaionsVector) <- colnames(DrugDiseaseGeneMatrix)

  ## First value to phyper: input genes per annotation, restricted to the
  ## annotations that are hit at all. Filtering the named row directly keeps
  ## the names even when a single annotation survives; which() skips NAs.
  GenesFromInput <- DrugDiseaseFeatureMatrix[DrugDiseaseName, ]
  GenesFromInput <- GenesFromInput[which(GenesFromInput > 0)]
  AnnotationNames <- names(GenesFromInput)
  if (length(AnnotationNames) == 0) {
    ## Nothing to test for this drug/disease.
    return(TotalAnnotaionsVector)
  }

  ## Second value to phyper: genes per annotation. drop = FALSE keeps the
  ## subset two-dimensional when only one annotation is selected.
  if (is.null(GenesinAnnotation0)) {
    GenesinAnnotation <- colSums(
      DrugDiseaseGeneMatrix[, AnnotationNames, drop = FALSE]
    )
  } else {
    GenesinAnnotation <- GenesinAnnotation0[AnnotationNames]
  }
  ## Third value to phyper: genes outside each annotation.
  RemainingGenes <- TotalGenesCount - GenesinAnnotation
  ## Fourth value to phyper: number of genes drawn (recycled by phyper).
  NumberOfGenesInDrug <- length(DrugDiseaseGenes)

  ## P(X >= observed) is P(X > observed - 1), hence the "- 1".
  PValues <- phyper(GenesFromInput - 1, GenesinAnnotation, RemainingGenes,
                    NumberOfGenesInDrug, lower.tail = FALSE)
  names(PValues) <- AnnotationNames
  AdjustedPvalues <- p.adjust(PValues, method = "BH")

  ## Keep significant annotations only; which() ignores NA/NaN p-values.
  EnrichedAnnotations <- AdjustedPvalues[which(AdjustedPvalues <= 0.05)]
  ## A p-value of exactly zero would give Inf after the log; clamp it.
  EnrichedAnnotations[EnrichedAnnotations == 0] <- 2.2e-16
  EnrichedAnnotations <- -log(EnrichedAnnotations, 2)

  TotalAnnotaionsVector[names(EnrichedAnnotations)] <- EnrichedAnnotations
  TotalAnnotaionsVector
}


## Get the unique gene symbols recorded for one drug/disease.
##
## Arguments:
##   DiseaseName  Value matched against DFDrgDis$DrugName.
##   DFDrgDis     Data frame with a `DrugName` column and a `Symbol` column
##                holding comma-separated gene symbols.
##
## Returns a character vector of distinct, whitespace-trimmed symbols
## (character(0) when nothing matches).
GetGeneList = function(DiseaseName,DFDrgDis){
  ## as.character(): strsplit() rejects factors, which is what `Symbol` is
  ## when the data frame was built with stringsAsFactors = TRUE.
  Symbols = as.character(DFDrgDis[["Symbol"]][DFDrgDis$DrugName == DiseaseName])
  GeneList = unlist(strsplit(Symbols, ",", fixed = TRUE), use.names = FALSE)
  GeneList = unique(trimws(as.character(GeneList)))
  ## Empty tokens (from ",," or ", ") and NAs are not genes; keeping them
  ## would inflate the gene count used as the sample size downstream.
  GeneList[!is.na(GeneList) & nzchar(GeneList)]
}


## Parallelize the code
## Use all but one of the detected cores. detectCores() can return NA or 1,
## so clamp to at least one worker; makeCluster() cannot start with 0 or NA.
numberofCores = max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
### Closing all the existing connections
### NOTE(review): this closes every user connection in the session (open
### files, sinks, ...), not only left-over cluster sockets -- confirm that
### this is intended before running it in a shared session.
closeAllConnections()
### Start one worker process per requested core
Cluster <- parallel::makeCluster(numberofCores)
### Register the cluster as the backend used by foreach's %dopar%
doParallel::registerDoParallel(Cluster)





## Please download the RObject from below link
## Please download the three objects for reproducible example
## https://drive.google.com/drive/folders/0Bz9Y4BgZAF7oS2dtVVEwN0Z1Tnc?usp=sharing

## Load the three serialized inputs of the reproducible example.
## GeneAnnotations is later passed as the gene matrix, DiseaseAnnotations as
## the feature matrix and Diseases as the drug/disease data frame.
GeneAnnotations <- readRDS("Desktop/StackOverFlow/GeneAnnotations.rds")
DiseaseAnnotations <- readRDS("Desktop/StackOverFlow/DiseaseAnnotations.rds")
Diseases <- readRDS("Desktop/StackOverFlow/Diseases.rds")
## The row names of the feature matrix are the disease names to iterate over
DisNames <- rownames(DiseaseAnnotations)

## Run GetEnrichedAnnotations() for every disease on the registered cluster.
## GetGeneList is exported explicitly because it is only referenced inside
## GetEnrichedAnnotations, not in the loop body itself.
library(foreach)
## seq_along() yields an empty loop for zero diseases, whereas
## 1:length(DisNames) would iterate over c(1, 0).
EnrichedAnnotations <- foreach(i = seq_along(DisNames), .export = c("GetGeneList"), .packages = "Matrix") %dopar% {
  GetEnrichedAnnotations(DisNames[i], GeneAnnotations, DiseaseAnnotations, Diseases)
}

## Convert to a matrix with one row per disease and one column per annotation
## (row-binding the per-disease vectors equals the transposed column-bind).
EnrichedAnnotations <- do.call("rbind", EnrichedAnnotations)

## Each result vector is named by the columns of the gene matrix, so label the
## columns from that matrix (the original assigned the colnames to themselves).
colnames(EnrichedAnnotations) <- colnames(GeneAnnotations)
rownames(EnrichedAnnotations) <- DisNames


## Stop the cluster
parallel::stopCluster(Cluster)

Answer 1

如果对串行（sequential）版本做一次简单的性能分析（profiling），你会发现几乎所有时间都花在下面这两行上：

## Hot spot: on every call this subsets the gene matrix and then sums each
## selected column with apply()
GenesinAnnotation = DrugDiseaseGeneMatrix[,names(UPDNAnnotations)]
GenesinAnnotation = apply(GenesinAnnotation,2,sum)

您可以预先计算一次 colSums（这样就不会对同样的内容重复计算），而且实际上不需要先对数据取子集再求和——只需对求和结果取子集即可。

因此，只需把这两行替换为：

## Look up the precomputed column sums by annotation name instead of
## re-summing the matrix
GenesinAnnotation <- GenesinAnnotation0[names(UPDNAnnotations)]

并在此之前预先计算一次：

## Computed once up front: the sum of every column of the gene matrix
GenesinAnnotation0 <- colSums(DrugDiseaseGeneMatrix)

以优化的方式进行富集分析

1 个答案: