Needleman-Wunsch全局对齐中的回溯,无需存储指针

时间:2013-11-24 00:29:42

标签: python dynamic-programming

我的理解是,虽然我可以找到动态编程的基本上每个讨论都有一个存储指针,因为填充了矩阵,而不是在回溯步骤中重新计算先前的单元格。

据我所知,我有我的动态编程算法来正确构建矩阵,但我对如何进行回溯计算感到困惑。我也被告知有必要重新计算这些值(而不仅仅是查找它们),但我不知道它会如何产生不同的数字。

我正在实施的SW版本包括两个序列中的间隙打开选项,因此每个矩阵的递归关系有三个选项。下面是我的全局对齐类的当前版本。根据我的手计算,我相信score_align正确生成矩阵,但显然traceback_col_seq不起作用。

INF = 2147483647 #max size of int32
class global_aligner():
    def __init__(self, subst, open=10, extend=2, double=3):
        self.extend, self.open, self.double, self.subst = extend, open, double, subst
    def __call__(self, row_seq, col_seq):
        #add alphabet error checking?
        score_align(row_seq, col_seq)
        return traceback_col_seq()
    def init_array(self):
        self.M = zeros((self.maxI, self.maxJ), int)
        self.Ic = zeros((self.maxI, self.maxJ), int)
        self.Ir = zeros((self.maxI, self.maxJ), int)
        for i in xrange(self.maxI):
            self.M[i][0], self.Ir[i][0], self.Ic[i][0] = \
                    -INF, -INF, -(self.open+self.extend*i)
        for j in xrange(self.maxJ):
            self.M[0][j], self.Ic[0][j], self.Ir[0][j] = \
                    -INF, -INF, -(self.open+self.extend*j)
        self.M[0][0] = 0
        self.Ic[0][0] = -self.open
    def score_cell(self, i, j, chars):
        thisM = [self.Ic[i-1][j-1]+self.subst[chars], self.M[i-1][j-1]+\
                        self.subst[chars], self.Ir[i-1][j-1]+self.subst[chars]]
        thisC = [self.Ic[i][j-1]-self.extend, self.M[i][j-1]-self.open, \
                        self.Ir[i][j-1]-self.double]
        thisR = [self.M[i-1][j]-self.open, self.Ir[i-1][j]-self.extend, \
                        self.Ic[i-1][j]-self.double]
        return max(thisM), max(thisC), max(thisR)
    def score_align(self, row_seq, col_seq):
        self.row_seq, self.col_seq = list(row_seq), list(col_seq)
        self.maxI, self.maxJ = len(self.row_seq)+1, len(self.col_seq)+1
        self.init_array()
        for i in xrange(1, self.maxI):
            row_char = self.row_seq[i-1]
            for j in xrange(1, self.maxJ):
                chars = row_char+self.col_seq[j-1]
                self.M[i][j], self.Ic[i][j], self.Ir[i][j] = \
                        self.score_cell(i, j, chars)
    def traceback_col_seq(self):
        self.traceback = list()
        i, j = self.maxI-1, self.maxJ-1
        while i > 1 and j > 1:
            cell = [self.M[i][j], self.Ic[i][j], self.Ir[i][j]]
            cellMax = max(cell)
            chars = self.row_seq[i-1]+self.col_seq[j-1]
            if cell.index(cellMax) == 0: #M
                diag = [diagM, diagC, diagR] = self.score_cell(i-1, j-1, chars)
                diagMax = max(diag)
                if diag.index(diagMax) == 0: #match
                    self.traceback.append(self.col_seq[j-1])
                elif diag.index(diagMax) == 1: #insert column (open)
                    self.traceback.append('-')
                elif diag.index(diagMax) == 2: #insert row (open other)
                    self.traceback.append(self.col_seq[j-1].lower())
                i, j = i-1, j-1
            elif cell.index(cellMax) == 1: #Ic
                up = [upM, upC, upR] = self.score_cell(i-1, j, chars)
                upMax = max(up)
                if up.index(upMax) == 0: #match (close)
                    self.traceback.append(self.col_seq[j-1])
                elif up.index(upMax) == 1: #insert column (extend)
                    self.traceback.append('-')
                elif up.index(upMax) == 2: #insert row (double)
                    self.traceback.append('-')
                i -= 1
            elif cell.index(cellMax) == 2: #Ir
                left = [leftM, leftC, leftR] = self.score_cell(i, j-1, chars)
                leftMax = max(left)
                if left.index(leftMax) == 0: #match (close)
                    self.traceback.append(self.col_seq[j-1])
                elif left.index(leftMax) == 1: #insert column (double)
                    self.traceback.append('-')
                elif left.index(leftMax) == 2: #insert row (extend other)
                    self.traceback.append(self.col_seq[j-1].lower())
                j -= 1
        for j in xrange(0,j,-1):
            self.traceback.append(self.col_seq[j-1])
        for i in xrange(0,i, -1):
            self.traceback.append('-')
        return ''.join(self.traceback[::-1])    


test = global_aligner(blosumMatrix)
test.score_align('AA','AAA')
test.traceback_col_seq()

1 个答案:

答案 0 :(得分:1)

我认为主要的问题是,在生成可能来自的单元格时,您并未考虑当前所使用的矩阵。 cell = [self.M[i][j], self.Ic[i][j], self.Ir[i][j]]第一次通过while循环是正确的,但之后你不能只选择得分最高的矩阵。您的选择受到您来自哪里的限制。我在跟踪你的代码时遇到了一些麻烦,但我认为你在while循环的if语句中考虑到了这一点。如果是这种情况,那么我认为按照这些方面的改变就足够了:

 cell = [self.M[i][j], self.Ic[i][j], self.Ir[i][j]]
 cellIndex = cell.index(max(cell))
 while i > 1 and j > 1:
      chars = self.row_seq[i-1]+self.col_seq[j-1]
      if cellIndex == 0: #M
            diag = [diagM, diagC, diagR] = self.score_cell(i-1, j-1, chars)
            diagMax = max(diag)
            ...
            cellIndex = diagMax
            i, j = i-1, j-1
        elif cell.index(cellMax) == 1: #Ic
            up = [upM, upC, upR] = self.score_cell(i-1, j, chars)
            upMax = max(up)
            ...
            cellIndex = upMax
            i -= 1
        elif cell.index(cellMax) == 2: #Ir
            left = [leftM, leftC, leftR] = self.score_cell(i, j-1, chars)
            leftMax = max(left)
            ...
            cellIndex = leftMax
            j -= 1

就像我说的那样,我并不认为我正确地遵循你的代码,但看看是否有帮助。