这是一个用 R 编写的非常简单的生物信息学实现,用于计算序列的自比对(self-alignment)得分矩阵。它用滑动窗口在字符串序列上做两层循环:外层取片段 frag1,内层把它与同一序列的每个片段 frag2 逐位置进行比较。
下面的代码非常慢,我不知道如何用标准 R 语法来加快速度。同样的逻辑在 Python 中会非常快,但在 R 中需要大约 1 分钟。通过同时给 scoreMatrix[i,j] 和 scoreMatrix[j,i] 赋值,我已经把计算量减少了一半。
有什么加速的建议吗?
# Self-alignment score matrix of a protein sequence (slow reference script).
# Every window of `windowSize` residues is compared, position by position,
# with every other window of the same sequence; the mean BLOSUM50 score of the
# pair is stored in scoreMatrix[i, j] and mirrored into scoreMatrix[j, i].
sequence <- 'MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL'

# Load the substitution matrix only if it is not already in the session.
if (!exists('BLOSUM50')) {
  library(Biostrings)
  data(BLOSUM50)
  # Example lookup: BLOSUM50['A', 'N']
}

windowSize <- 24
matrixSize <- nchar(sequence) - windowSize
# Sentinel marking cells that have not been computed yet.
defaultValue <- -10000000000
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)

# seq_len() (rather than 1:matrixSize) makes the loops no-ops when the
# sequence yields zero windows instead of iterating over c(1, 0).
for (i in seq_len(matrixSize)) {
  frag1 <- substr(sequence, i, i + windowSize)
  for (j in seq_len(matrixSize)) {
    frag2 <- substr(sequence, j, j + windowSize)
    totalScore <- 0
    # Only cells still holding the sentinel need computing; the mirrored
    # assignment below has already filled the other half.
    if (scoreMatrix[i, j] == defaultValue) {
      # Only the first `windowSize` residues of each fragment are compared.
      for (x in seq_len(windowSize)) {
        totalScore <- totalScore +
          BLOSUM50[substr(frag1, x, x), substr(frag2, x, x)] / windowSize
      }
      scoreMatrix[i, j] <- totalScore
      scoreMatrix[j, i] <- totalScore
    }
  }
}

# `return()` is an error outside a function ("no function to return from"),
# so the matrix is simply left as the final value of the script.
scoreMatrix
答案 0 :(得分:2)
你的原始代码在我那台不算新的笔记本电脑(2014 年的 Lenovo Yoga 2,R 3.4)上运行需要 17 秒。只做一点并不复杂的优化——在计算开始时把 sequence 转换为字符向量——运行时间就缩短到了 2 秒。之后,我把对 BLOSUM50 按名称的索引改成了按数字下标的索引,执行时间进一步降到 0.5 秒。代码和基准测试结果见下文。
#' Baseline self-alignment score matrix (deliberately unoptimised).
#'
#' Every window of 24 residues starting at position `i` is compared, position
#' by position, with the window starting at position `j`. The mean
#' substitution score of the pair is stored at `[i, j]` and mirrored to
#' `[j, i]`. Scores are looked up by residue name in the global matrix
#' `BLOSUM50`, so a scored residue that is not a row/column name of that
#' matrix raises a "subscript out of bounds" error.
#'
#' @param sequence A single character string of residue letters.
#' @return A square numeric matrix with `nchar(sequence) - 24` rows and
#'   columns (0 x 0 when the sequence has exactly 24 residues).
fun <- function(sequence) {
  stopifnot(is.character(sequence), length(sequence) == 1L, !is.na(sequence))
  windowSize <- 24
  matrixSize <- nchar(sequence) - windowSize
  if (matrixSize < 0) {
    stop("`sequence` must contain at least ", windowSize, " residues",
         call. = FALSE)
  }
  # Sentinel marking cells that have not been computed yet.
  defaultValue <- -10000000000
  scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)
  # seq_len() keeps the loops empty when there are no windows; 1:0 would
  # iterate over c(1, 0) and index outside the 0 x 0 matrix.
  for (i in seq_len(matrixSize)) {
    # Exactly `windowSize` residues; the extra residue formerly extracted
    # here was never read by the scoring loop.
    frag1 <- substr(sequence, i, i + windowSize - 1)
    for (j in seq_len(matrixSize)) {
      frag2 <- substr(sequence, j, j + windowSize - 1)
      totalScore <- 0
      # Cells already filled through the mirrored assignment are skipped.
      if (scoreMatrix[i, j] == defaultValue) {
        for (x in seq_len(windowSize)) {
          totalScore <- totalScore +
            BLOSUM50[substr(frag1, x, x), substr(frag2, x, x)] / windowSize
        }
        scoreMatrix[i, j] <- totalScore
        scoreMatrix[j, i] <- totalScore
      }
    }
  }
  scoreMatrix
}
#' Self-alignment score matrix using a pre-split residue vector.
#'
#' Same computation as the baseline, but the sequence is split into single
#' characters once up front, so windows are plain vector slices instead of
#' repeated `substr()` calls. Scores are looked up by residue name in the
#' global matrix `BLOSUM50`; a scored residue that is not a row/column name
#' of that matrix raises a "subscript out of bounds" error.
#'
#' @param sequence A single character string of residue letters.
#' @return A square numeric matrix with `nchar(sequence) - 24` rows and
#'   columns (0 x 0 when the sequence has exactly 24 residues).
fun2 <- function(sequence) {
  stopifnot(is.character(sequence), length(sequence) == 1L, !is.na(sequence))
  windowSize <- 24
  residues <- unlist(strsplit(sequence, split = ""))
  matrixSize <- length(residues) - windowSize
  if (matrixSize < 0) {
    stop("`sequence` must contain at least ", windowSize, " residues",
         call. = FALSE)
  }
  # Sentinel marking cells that have not been computed yet.
  defaultValue <- -10000000000
  scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)
  # Offsets of the residues inside one window (0 .. windowSize - 1).
  windowOffsets <- seq_len(windowSize) - 1L
  # seq_len() keeps the loops empty when there are no windows; 1:0 would
  # iterate over c(1, 0) and index outside the 0 x 0 matrix.
  for (i in seq_len(matrixSize)) {
    frag1 <- residues[i + windowOffsets]
    for (j in seq_len(matrixSize)) {
      frag2 <- residues[j + windowOffsets]
      totalScore <- 0
      # Cells already filled through the mirrored assignment are skipped.
      if (scoreMatrix[i, j] == defaultValue) {
        for (x in seq_len(windowSize)) {
          totalScore <- totalScore + BLOSUM50[frag1[x], frag2[x]] / windowSize
        }
        scoreMatrix[i, j] <- totalScore
        scoreMatrix[j, i] <- totalScore
      }
    }
  }
  scoreMatrix
}
#' Self-alignment score matrix using numeric indexing into BLOSUM50.
#'
#' The sequence is split into residues and each residue is translated once
#' into its position among `colnames(BLOSUM50)`. A window pair is then scored
#' with a single vectorised lookup: the linear index
#' `(column - 1) * nrow + row` selects all 24 substitution scores at once.
#' The column position is reused as the row position, so the global matrix
#' `BLOSUM50` must list its rows and columns in the same order.
#'
#' @param sequence A single character string of residue letters.
#' @return A square numeric matrix with `nchar(sequence) - 24` rows and
#'   columns (0 x 0 when the sequence has exactly 24 residues).
fun3 <- function(sequence) {
  stopifnot(is.character(sequence), length(sequence) == 1L, !is.na(sequence))
  windowSize <- 24
  sequence <- unlist(strsplit(sequence, split = ""))
  matrixSize <- length(sequence) - windowSize
  if (matrixSize < 0) {
    stop("`sequence` must contain at least ", windowSize, " residues",
         call. = FALSE)
  }
  # NA_real_ keeps the result numeric even when no cell is ever filled.
  scoreMatrix <- matrix(NA_real_, nrow = matrixSize, ncol = matrixSize)
  sequence_index <- match(sequence, colnames(BLOSUM50))

  if (matrixSize > 0) {
    # An unmatched residue would give an NA index and silently turn every
    # score that touches it into NA, so fail loudly instead. The final
    # residue never falls inside a scored window and is not checked.
    scored <- seq_len(length(sequence) - 1L)
    unknown <- unique(sequence[scored][is.na(sequence_index[scored])])
    if (length(unknown) > 0) {
      stop("residues not found in BLOSUM50: ",
           paste(unknown, collapse = ", "), call. = FALSE)
    }
  }

  # Loop invariants: row count for the linear index and in-window offsets.
  nRows <- NROW(BLOSUM50)
  windowOffsets <- seq_len(windowSize) - 1L
  for (i in seq_len(matrixSize)) {
    frag1 <- sequence_index[i + windowOffsets]
    for (j in seq_len(matrixSize)) {
      # NA marks a cell not yet filled by the mirrored assignment below.
      if (is.na(scoreMatrix[i, j])) {
        frag2 <- sequence_index[j + windowOffsets]
        totalScore <- sum(BLOSUM50[(frag2 - 1L) * nRows + frag1]) / windowSize
        scoreMatrix[i, j] <- totalScore
        scoreMatrix[j, i] <- totalScore
      }
    }
  }
  scoreMatrix
}
# Benchmark driver: make sure the BLOSUM50 substitution matrix is available,
# then time the three implementations on the same test sequence.
if (!exists("BLOSUM50")) {
  library(Biostrings)
  data(BLOSUM50)
  # Example lookup: BLOSUM50["A", "N"]
}

sequence <- "MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL"

library(microbenchmark)

# Five timed runs per implementation.
microbenchmark(
  original = fun(sequence),
  fun2     = fun2(sequence),
  fun3     = fun3(sequence),
  times    = 5
)
# Timings recorded by the answer's author:
# Unit: milliseconds
#      expr        min         lq       mean     median         uq        max neval
#  original 16395.2108 16660.3295 17533.8563 16755.9680 17594.3596 20263.4137     5
#      fun2  1992.7731  2010.4031  2027.7953  2015.9592  2034.9022  2084.9390     5
#      fun3   472.0641   481.9267   496.2656   498.3259   506.6357   522.3755     5