Question

我正在使用动态编程（半全局对齐）比较大小为5500个碱基的参考序列和大小为3600的查询序列，实际上我对复杂性和性能并不了解，代码正在爆炸并给出我错误的“内存不足”。知道它在较小的序列上正常工作，我的问题是：这种行为是正常的，或者我可能在代码中有另一个问题？如果解决这个问题的任何提示是正常的吗？提前谢谢。

sub semiGlobal {
    my ( $seq1, $seq2,$MATCH,$MISMATCH,$GAP ) = @_;
    # initialization: first row to 0 ;
    my @matrix;
    $matrix[0][0]{score}   = 0;
    $matrix[0][0]{pointer} = "none";
    for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
        $matrix[0][$j]{score}   = 0;
        $matrix[0][$j]{pointer} = "none";
    }

    for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {
        $matrix[$i][0]{score}   = $GAP * $i;
        $matrix[$i][0]{pointer} = "up";
    }

    # fill
    my $max_i     = 0;
    my $max_j     = 0;
    my $max_score = 0;

    print "seq2: ".length($seq2);
    print "seq1: ".length($seq1);

    for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {
        for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
            my ( $diagonal_score, $left_score, $up_score );
            # calculate match score
            my $letter1 = substr( $seq1, $j - 1, 1 );
            my $letter2 = substr( $seq2, $i - 1, 1 );
            if ( $letter1 eq $letter2 ) {
                $diagonal_score = $matrix[ $i - 1 ][ $j - 1 ]{score} +  $MATCH;
            }
            else {
                $diagonal_score = $matrix[ $i - 1 ][ $j - 1 ]{score} +  $MISMATCH;
            }

            # calculate gap scores
            $up_score   = $matrix[ $i - 1 ][$j]{score} +  $GAP;
            $left_score = $matrix[$i][ $j - 1 ]{score} +  $GAP;

            # choose best score
            if ( $diagonal_score >= $up_score ) {
                if ( $diagonal_score >= $left_score ) {
                    $matrix[$i][$j]{score}   = $diagonal_score;
                    $matrix[$i][$j]{pointer} = "diagonal";
                }
                else {
                    $matrix[$i][$j]{score}   = $left_score;
                    $matrix[$i][$j]{pointer} = "left";
                }
            }
            else {
                if ( $up_score >= $left_score ) {
                    $matrix[$i][$j]{score}   = $up_score;
                    $matrix[$i][$j]{pointer} = "up";
                }
                else {
                    $matrix[$i][$j]{score}   = $left_score;
                    $matrix[$i][$j]{pointer} = "left";
                }
            }

            # set maximum score
            if ( $matrix[$i][$j]{score} > $max_score ) {
                $max_i     = $i;
                $max_j     = $j;
                $max_score = $matrix[$i][$j]{score};
            }
        }
    }

    my $align1 = "";
    my $align2 = "";
    my $j = $max_j;
    my $i = $max_i;

    while (1) {
        if ( $matrix[$i][$j]{pointer} eq "none" ) {
            $stseq1 = $j;
            last;
        }

        if ( $matrix[$i][$j]{pointer} eq "diagonal" ) {
            $align1 .= substr( $seq1, $j - 1, 1 );
            $align2 .= substr( $seq2, $i - 1, 1 );
            $i--;
            $j--;
        }
        elsif ( $matrix[$i][$j]{pointer} eq "left" ) {
            $align1 .= substr( $seq1, $j - 1, 1 );
            $align2 .= "-";
            $j--;
        }
        elsif ( $matrix[$i][$j]{pointer} eq "up" ) {
            $align1 .= "-";
            $align2 .= substr( $seq2, $i - 1, 1 );
            $i--;
        }
    }

    $align1 = reverse $align1;
    $align2 = reverse $align2;
    return ( $align1, $align2, $stseq1 ,$max_j);
}

Answer 1

可能解决问题的一种方法是将@matrix与文件绑定。但是，这将大大减慢程序的速度。考虑一下：

sub semiGlobal {

    use Tie::Array::CSV; 

    tie my @matrix, 'Tie::Array::CSV', 'temp.txt'; # Don't forget to add your own error handler.


    my ( $seq1, $seq2,$MATCH,$MISMATCH,$GAP ) = @_;

    # initialization: first row to 0 ;

    $matrix[0][0] = '0 n';
    for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
        $matrix[0][$j] = '0 n';
    }

    for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {

        my $score = $GAP * $i;
        $matrix[$i][0] = join ' ',$score,'u';
    }

    #print Dumper(\@matrix);

    # fill
    my $max_i     = 0;
    my $max_j     = 0;
    my $max_score = 0;

    print "seq2: ".length($seq2)."\n";
    print "seq1: ".length($seq1)."\n";

    for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {
        for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
            my ( $diagonal_score, $left_score, $up_score );

            # calculate match score
            my $letter1 = substr( $seq1, $j - 1, 1 );
            my $letter2 = substr( $seq2, $i - 1, 1 );
            my $score = (split / /, $matrix[ $i - 1 ][ $j - 1 ])[0];
            if ( $letter1 eq $letter2 ) {
                $diagonal_score = $score +  $MATCH;
            }
            else {
                $diagonal_score = $score +  $MISMATCH;
            }

            # calculate gap scores
            $up_score   = (split / /,$matrix[ $i - 1 ][$j])[0] +  $GAP;
            $left_score = (split / /,$matrix[$i][ $j - 1 ])[0] +  $GAP;

            # choose best score
            if ( $diagonal_score >= $up_score ) {
                if ( $diagonal_score >= $left_score ) {
                    $matrix[$i][$j] = join ' ',$diagonal_score,'d';
                }
                else {
                    $matrix[$i][$j] = join ' ', $left_score, 'l';
                }
            }
            else {
                if ( $up_score >= $left_score ) {
                    $matrix[$i][$j] = join ' ', $up_score, 'u';
                }
                else {
                    $matrix[$i][$j] = join ' ', $left_score, 'l';
                }
            }

            # set maximum score
            if ( (split / /, $matrix[$i][$j])[0] > $max_score ) {
                $max_i     = $i;
                $max_j     = $j;
                $max_score = (split / /, $matrix[$i][$j])[0];

            }
        }
    }


    my $align1 = "";
    my $align2 = "";
    my $stseq1;

    my $j = $max_j;
    my $i = $max_i;

    while (1) {
        my $pointer = (split / /, $matrix[$i][$j])[1];
        if ( $pointer eq "n" ) {
            $stseq1 = $j;
            last;
        }

        if ( $pointer eq "d" ) {
            $align1 .= substr( $seq1, $j - 1, 1 );
            $align2 .= substr( $seq2, $i - 1, 1 );
            $i--;
            $j--;
        }
        elsif ( $pointer eq "l" ) {
            $align1 .= substr( $seq1, $j - 1, 1 );
            $align2 .= "-";
            $j--;
        }
        elsif ( $pointer eq "u" ) {
            $align1 .= "-";
            $align2 .= substr( $seq2, $i - 1, 1 );
            $i--;
        }
    }

    $align1 = reverse $align1;
    $align2 = reverse $align2;

    untie @matrix; # Don't forget to add your own error handler.

    unlink 'temp.txt'; # Don't forget to add your own error handler.

    return ( $align1, $align2, $stseq1 ,$max_j);
}

您仍然可以将原始子用于短序列，并切换到此子目录以获得较长的序列。

Answer 2

我认为@j_random_hacker和@Ashalynd在大多数Perl实现中使用此算法是正确的。您正在使用的数据类型将使用更多计算所需的内存。

所以这是“正常”的，你应该期望在perl中编写这个算法时会看到这种内存使用情况。你可能在使用大量内存的周围代码中遇到其他问题但是这个算法会在大量序列的情况下严重打击你的内存。

您可以通过更改@Ashalynd建议使用的数据类型来解决一些内存问题。您可以尝试更改将分数和指针保存到数组中的哈希值，并将字符串指针更改为整数值。这样的事情可能会在保持可读性的同时为您带来一些好处：

use strict;
use warnings;

# define constants for array positions and pointer values
# so the code is still readable.
# (If you have the "Readonly" CPAN module you may want to use it for constants
# instead although none of the downsides of the "constant" pragma apply in this code.)
use constant {
  SCORE => 0,
  POINTER => 1,

  DIAGONAL => 0,
  LEFT => 1,
  UP => 2,
  NONE => 3,
};

...

sub semiGlobal2 {
  my ( $seq1, $seq2,$MATCH,$MISMATCH,$GAP ) = @_;

  # initialization: first row to 0 ;
  my @matrix;

  # score and pointer are now stored in an array
  # using the defined constants as indices
  $matrix[0][0][SCORE]   = 0;

  # pointer value is now a constant integer
  $matrix[0][0][POINTER] = NONE;

  for ( my $j = 1 ; $j <= length($seq1) ; $j++ ) {
      $matrix[0][$j][SCORE]   = 0;
      $matrix[0][$j][POINTER] = NONE;
  }

  for ( my $i = 1 ; $i <= length($seq2) ; $i++ ) {

      $matrix[$i][0][SCORE]   = $GAP * $i;
      $matrix[$i][0][POINTER] = UP;
  }

... # continue to make the appropriate changes throughout the code

然而，当我测试这个时，我尝试在5500字符串的随机数据中对齐3600字符串时没有获得巨大的好处。当我消耗超过2GB的内存时，我将代码编程为中止。原始代码在23秒后中止，而使用常量和数组而不是哈希的代码在32秒后中止。

如果您真的想使用此特定算法，我会查看Algorithm::NeedlemanWunsch的效果。它看起来不是很成熟，但可能已经解决了您的性能问题。否则，请考虑围绕C实现编写Inline或Perl XS包装器

perl通过动态编程在序列对齐中爆炸

2 个答案: