我正在尝试使用Smith–Waterman algorithm在Python中实现本地序列比对。
这是我到目前为止所拥有的。它构建了similarity matrix:
import sys, string
from numpy import *
f1=open(sys.argv[1], 'r')
seq1=f1.readline()
f1.close()
seq1=string.strip(seq1)
f2=open(sys.argv[2], 'r')
seq2=f2.readline()
f2.close()
seq2=string.strip(seq2)
a,b =len(seq1),len(seq2)
penalty=-1;
point=2;
#generation of matrix for local alignment
p=zeros((a+1,b+1))
# table calculation and matrix generation
for i in range(1,a+1):
for j in range(1,b+1):
vertical_score =p[i-1][j]+penalty;
horizontal_score= p[i][j-1]+penalty;
if seq1[i-1]==seq2[j-1]:
diagonal_score =p[i-1][j-1]+point;
else:
diagonal_score = p[i-1][j-1]+penalty;
p[i][j]=max(0,vertical_score,horizontal_score,diagonal_score);
print p
例如,使用两个序列:
agcacact
acacacta
我的代码输出相似度矩阵:
[[ 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 2. 1. 2. 1. 2. 1. 0. 2.]
[ 0. 1. 1. 1. 1. 1. 1. 0. 1.]
[ 0. 0. 3. 2. 3. 2. 3. 2. 1.]
[ 0. 2. 2. 5. 4. 5. 4. 3. 4.]
[ 0. 1. 4. 4. 7. 6. 7. 6. 5.]
[ 0. 2. 3. 6. 6. 9. 8. 7. 8.]
[ 0. 1. 4. 5. 8. 8. 11. 10. 9.]
[ 0. 0. 3. 4. 7. 7. 10. 13. 12.]]
现在我陷入了算法的下一步,即回溯以构建最佳对齐。
为了获得最佳的局部对齐,我们从矩阵中的最高值开始( i , j )。然后,我们回到其中一个位置( i - 1, j ),( i , j - 1)和( i - 1, j - 1)取决于用于构造矩阵的运动方向。我们保持这个过程,直到我们到达一个零值的矩阵单元格或位置(0,0)的值。
我无法确定要回溯到哪个位置。维基百科的意思是“取决于用于构造矩阵的运动方向”意味着我将如何在Python中实现这一点?
最后我做了这个
import sys, string
from numpy import*
import re
# read first sequence
fasta_sequence1=open(sys.argv[1], 'r')
seq1=""
for i in fasta_sequence1:
if i.startswith(">"):
pass
else:
seq1 = seq1 + i.strip()
fasta_sequence1.close()
fasta_sequence2=open(sys.argv[2], 'r')
seq2 = ""
for i in fasta_sequence2:
if i.startswith('>'):
pass
else:
seq2 = seq2+ i.strip()
fasta_sequence2.close()
a,b =len(seq1),len(seq2)
penalty=-1;
point=2;
#generation of matrix for local alignment
p=zeros((a+1,b+1))
#intialization of max score
max_score=0;
#pointer to store the traceback path
pointer=zeros((a+1,b+1))
# table calculation and matrix generation
for i in range(1,a+1):
for j in range(1,b+1):
vertical_score =p[i-1][j]+penalty;
horizontal_score= p[i][j-1]+penalty;
if seq1[i-1]==seq2[j-1]:
diagonal_score =p[i-1][j-1]+point;
else:
diagonal_score = p[i-1][j-1]+penalty;
for i in range(1,a+1):
for j in range(1,b+1):
p[i][j]=max(0,vertical_score,horizontal_score,diagonal_score);
for i in range(1,a+1):
for j in range(1,b+1):
if p[i][j]==0:
pointer[i][j]=0; #0 means end of the path
if p[i][j]==vertical_score:
pointer[i][j]=1; #1 means trace up
if p[i][j]==horizontal_score:
pointer[i][j]=2; #2 means trace left
if p[i][j]==diagonal_score:
pointer[i][j]=3; #3 means trace diagonal
if p[i][j]>=max_score:
maximum_i=i;
maximum_j=j;
max_score=p[i][j];
#for i in range(1,a+1):
# for j in range(1,b+1):
#if p[i][j]>max_score:
#max_score=p[i][j]
print max_score
# traceback of all the pointers
align1,align2='',''; #initial sequences
i,j=max_i,max_j; #indices of path starting point
while pointer[i][j]!=0:
if pointer[i][j]==3:
align1=align1+seq1[i-1];
align2=align2+seq2[j-1];
i=i-1;
j=j-1;
elif pointer[i][j]==2:
align1=align1+'-';
align2=align2+seq2[j-1]
j=j-1;
elif pointer[i][j]==1:
align1=align1+seq1[i-1];
align2=align2+'-';
i=i-1;
align1=align1[::-1]; #reverse sequence 1
align2=align2[::-1]; #reverse sequence 2
#output_file = open(sys.argv[3],"w")
#output_file.write(align1)
#output_file.write(align2)
print align1
print align2
但我认为可以采用更好更有效的方式来实现这一目标
output_file = open(sys.argv[3],"w")
output_file.write(align1)
output_file.write(align2)
结果显示为
agcacacta-cacact
相反:
print align1
print align2
显示正确的输出:
agcacact
a-cacact
如何在文件编写器中获得上述输出。感谢
答案 0 :(得分:11)
构建相似度矩阵时,您不仅需要存储相似度得分,还要存储得分来自的位置。您目前有一行代码:
p[i][j]=max(0,vertical_score,horizontal_score,diagonal_score);
所以在这里你需要记住的不是最高分,而是这些中的哪一个是最大分数。然后,当你来回溯时,你会知道要走哪条路。
例如,您可以尝试这样的事情:
import numpy
DELETION, INSERTION, MATCH = range(3)
def smith_waterman(seq1, seq2, insertion_penalty = -1, deletion_penalty = -1,
mismatch_penalty = -1, match_score = 2):
"""
Find the optimum local sequence alignment for the sequences `seq1`
and `seq2` using the Smith-Waterman algorithm. Optional keyword
arguments give the gap-scoring scheme:
`insertion_penalty` penalty for an insertion (default: -1)
`deletion_penalty` penalty for a deletion (default: -1)
`mismatch_penalty` penalty for a mismatch (default: -1)
`match_score` score for a match (default: 2)
See <http://en.wikipedia.org/wiki/Smith-Waterman_algorithm>.
>>> for s in smith_waterman('AGCAGACT', 'ACACACTA'): print s
...
AGCAGACT-
A-CACACTA
"""
m, n = len(seq1), len(seq2)
# Construct the similarity matrix in p[i][j], and remember how
# we constructed it -- insertion, deletion or (mis)match -- in
# q[i][j].
p = numpy.zeros((m + 1, n + 1))
q = numpy.zeros((m + 1, n + 1))
for i in range(1, m + 1):
for j in range(1, n + 1):
deletion = (p[i - 1][j] + deletion_penalty, DELETION)
insertion = (p[i][j - 1] + insertion_penalty, INSERTION)
if seq1[i - 1] == seq2[j - 1]:
match = (p[i - 1][j - 1] + match_score, MATCH)
else:
match = (p[i - 1][j - 1] + mismatch_penalty, MATCH)
p[i][j], q[i][j] = max((0, 0), deletion, insertion, match)
# Yield the aligned sequences one character at a time in reverse
# order.
def backtrack():
i, j = m, n
while i > 0 or j > 0:
assert i >= 0 and j >= 0
if q[i][j] == MATCH:
i -= 1
j -= 1
yield seq1[i], seq2[j]
elif q[i][j] == INSERTION:
j -= 1
yield '-', seq2[j]
elif q[i][j] == DELETION:
i -= 1
yield seq1[i], '-'
else:
assert(False)
return [''.join(reversed(s)) for s in zip(*backtrack())]