我正在计算序列的对数几率(log-odds)得分,并希望返回得分最高的基序(motif,即序列中的一小段)。我已经有代码可以计算文件中每条序列的最高得分,但在保存产生该得分的基序时遇到了麻烦。关于文件格式、对数几率得分的一般计算方法等,请参见我的另一个问题:Perl: Creating and manipulating hash of arrays for log-odds scores of DNA sequences。我的代码如下:

use strict;
use warnings;
use List::Util 'max';
use Data::Dumper;

# Script as posted in the question. Only the block delimiters (braces and
# the `else` branch, reconstructed from the indentation) have been restored
# so that the code parses again. The scoring logic is intentionally left
# untouched, because the text below discusses the output this logic produces.

# User specifies motif width
my $width = 3;

# User enters the filename that contains the sequence data
print "Please enter the filename of the fasta sequence data: ";
my $filename1 = <STDIN>;

# Remove the trailing newline from the filename
chomp $filename1;

# Open the file and store each dna seq in hash
my %id2seq = ();
my %HoA = ();
my %loscore = ();
my %maxscore = ();
my %maxmot = ();
my $id = '';
open (FILE, '<', $filename1) or die "Cannot open $filename1.",$!;
my $dna;
while (<FILE>) {
    if ($_ =~ /^>(.+)/) {
        $id = $1; # Stores 'Sequence 1' as the first $id, for example
    }
    else {
        $HoA{$id} = [ split(//) ]; # Splits the contents to allow for position reference later
        $id2seq{$id} .= $_; # Creates a hash with each seq associated to an id number
        $maxmot{$id} = (); # Intended to create an empty container to push motifs to
        foreach $id (keys %HoA) {
            # NOTE(review): length() is applied to an array reference here,
            # so this is not the number of bases in the sequence.
            for my $len (0..(length($HoA{$id})-$width-1)) {
                push @{ $loscore{$id} }, 0;
            }
        }
        push @{ $maxscore{$id} }, -30; # Creates a HoA with each id number to have a maxscore (initial score -30)
    }
}
close FILE;

foreach $id (keys %id2seq) {
    print "$id2seq{$id}\n\n";
}
print "\n\n";

# Create log-odds table of motifs: one row per base, one column per
# position inside the motif.
my %logodds;
$logodds{'A'}[0] = 0.1;
$logodds{'A'}[1] = 0.2;
$logodds{'A'}[2] = 0.3;
$logodds{'C'}[0] = 0.2;
$logodds{'C'}[1] = 0.5;
$logodds{'C'}[2] = 0.2;
$logodds{'G'}[0] = 0.3;
$logodds{'G'}[1] = 0.2;
$logodds{'G'}[2] = 0.4;
$logodds{'T'}[0] = 0.4;
$logodds{'T'}[1] = 0.1;
$logodds{'T'}[2] = 0.1;

# Determine location for each sequence that maximally
# aligns to the motif pattern

foreach $id (keys %HoA) {
    for my $pos1 (0..length($HoA{$id})-$width-1) {    # Look through all positions the motif can start at
        for my $pos2 ($pos1..$pos1+($width-1)) {      # Define the positions within the motif (0 to width-1)
            for my $base (qw( A C G T)) {
                if ($HoA{$id}[$pos2] eq $base) {      # If the character matches a base:
                    for my $pos3 (0..$width-1) {      # Used for position reference in %logodds
                        # Calculate the log-odds score at each location
                        $loscore{$id}[$pos2] += $logodds{$base}[$pos3];

                        # Calculate the maximum log-odds score for each sequence

                        # Find the motif that gives the maximum score for each sequence
                        $maxscore{$id} = max( @{ $loscore{$id} });
                        if ($loscore{$id}[$pos2] == $maxscore{$id}) {
                            push @{ $maxmot{$id} }, $HoA{$id}[$pos3]; #NOT SURE THAT THIS IS THE CORRECT WAY TO PUSH WHAT I WANT
                        }
                    }
                }
            }
        }
    }
}

print "\n\n";
print Dumper(\%maxmot);


'Sequence 11' => [ 'C', 'A', 'T'], 'Sequence 14' => ['C', 'T', 'G'], etc.

由于 $width = 3,预期输出中每条序列应该只有三个碱基。但我实际得到的输出中,每个碱基都重复出现了多次,而且看不出明显的顺序(至少我没有发现规律):

'Sequence 11' => [ 'C', 'C', 'C', 'A', 'A', 'A', 'A', 'T', 'A', 'A', 'T', 'T', 'T'], 'Sequence 14' => ['C', 'C', 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'T', 'T'], etc.

我认为问题出在 push @{ $maxmot{$id} }, $HoA{$id}[$pos3]; 这一步,但我不确定该如何解决。我尝试改用 $HoA{$id}[$pos2],但这似乎也没有解决问题。一如既往,非常感谢任何帮助!如有需要我可以进一步说明,我知道我的问题有些复杂。先谢谢了。

push() 本身不是问题。运行你的代码就能明显看出,条件 $loscore{$id}[$pos2] == $maxscore{$id} 为真的次数比你预期的要多。


  • 为什么在数组上使用length()?它不返回数组的长度。
  • for my $base (qw( A C G T)) { if ($HoA{$id}[$pos2] eq $base) {... 不就是 my $base = $HoA{$id}[$pos2]; 的一种低效的等价写法吗?
  • 每个 $pos2 的计算会被执行 $pos2 + 1 次,即位置 0 一次、位置 1 两次……也就是说,越靠后的位置得分越高。
  • $loscore{$id}[$pos2] 的一次计算得到的是 @{ $logodds{$base} } 的总和,也就是说该计算忽略了位置 $pos2 + $pos3 上的碱基。
  • 你在遍历序列的过程中不断重新计算 $maxscore{$id},然后又在条件判断中使用这个不断变化的值。
  • (我猜)基序应该是 $width 个碱基长,但你的代码每次只把单个碱基存入 %maxmot。



use strict;
use warnings;

use List::Util 'max';
use Data::Dumper;

# For every sequence, score each window of $width bases against the
# position-specific %logodds table and report the motif(s) that reach the
# maximum score.

# Motif width; must equal the number of columns per base in %logodds.
my $width = 3;

my %HoA;      # id => array ref of single characters of the sequence
my %maxpos;   # id => last index at which a full window still fits
my %loscore;  # id => array ref of window scores, indexed by start position
my $id = '';

# Input is read from the script's __DATA__ section, which is not part of
# this excerpt: a ">id" header line followed by a sequence line.
while (<DATA>) {
    if (/^>(.+)/) {
        $id = $1;
    } else {
        # Strip the line ending first so that the window bound below does
        # not depend on whether the last line ends in a newline.
        chomp;
        $HoA{$id}     = [ split(//) ];
        $maxpos{$id}  = @{ $HoA{$id} } - $width;
        $loscore{$id} = [ (0) x ($maxpos{$id} + 1) ];
    }
}

my %logodds = (
    A => [0.1, 0.2, 0.3],
    C => [0.2, 0.5, 0.2],
    G => [0.3, 0.2, 0.4],
    T => [0.4, 0.1, 0.1],
);

my %maxscore; # id => highest window score
my %maxmot;   # id => { motif => number of windows reaching the max score }

# Calculate the log-odds score at each location
foreach $id (keys %HoA) {
    for my $index (0..$maxpos{$id}) {
        for my $offset (0..$width-1) {
            # look at base in sequence $id at $offset after $index
            my $base = $HoA{$id}[$index + $offset];
            $loscore{$id}[$index] += $logodds{$base}[$offset];
        }
    }
}

# Calculate the maximum log-odds score for each sequence
foreach $id (keys %HoA) {
    $maxscore{$id} = max( @{ $loscore{$id} });
}

# Find the motif that gives the maximum score for each sequence
foreach $id (keys %HoA) {
    for my $index (0..$maxpos{$id}) {
        # Exact comparison is safe: $maxscore{$id} is one of these very values.
        if ($loscore{$id}[$index] == $maxscore{$id}) {
            # motif of length $width
            my $motif = join('', @{ $HoA{$id} }[$index..$index + $width - 1]);
            # count how often this motif reaches the maximum
            $maxmot{$id}{$motif}++;
        }
    }
}

print Data::Dumper->Dump([\%loscore, \%maxscore, \%maxmot],
                         [qw(*loscore *maxscore *maxmot)]);

exit 0;



$ perl
%loscore = (
             'Sequence_1' => [
             'Sequence_2' => [
             'Sequence_3' => [
%maxscore = (
              'Sequence_1' => '1.2',
              'Sequence_3' => '1.3',
              'Sequence_2' => '1.2'
%maxmot = (
            'Sequence_3' => {
                              'TCG' => 1
            'Sequence_2' => {
                              'TCA' => 1
            'Sequence_1' => {
                              'TCA' => 2



use strict;
use warnings;

use Data::Dumper;

# Pre-compute the log-odds score of every possible motif of length $width,
# then slide a window over each sequence and keep the best-scoring motif.

# Motif width; must equal the number of columns per base in %logodds.
my $width = 3;

my %logodds = (
    A => [0.1, 0.2, 0.3],
    C => [0.2, 0.5, 0.2],
    G => [0.3, 0.2, 0.4],
    T => [0.4, 0.1, 0.1],
);

# calculate log score for each motif combination
my $motif_score = {'' => 0}; # start with a 0-length motif
foreach my $offset (0..$width - 1) {
    my %scores;

    # for all motifs...
    foreach my $prefix (keys %{ $motif_score }) {
        my $base_score = $motif_score->{$prefix};

        # ... add another base to the motif
        for my $base (qw(A G C T)) {
            $scores{"${prefix}${base}"} = $base_score + $logodds{$base}[$offset];
        }
    }

    # store the scores for the new sequences
    $motif_score = \%scores;
}

#print Data::Dumper->Dump([$motif_score], [qw(motif_score)]);

my $id;
my %maxmot;   # id => best-scoring motif of that sequence

# Input is read from the script's __DATA__ section, which is not part of
# this excerpt: a ">id" header line followed by a sequence line.
while (<DATA>) {
    if (/^>(.+)/) {
        $id = $1;
    } else {
        chomp(my $sequence = $_);
        # undef means "no window scored yet"; a numeric sentinel such as -1
        # would be wrong as soon as the table contains negative log-odds.
        my $max;

        # run a window of length $width over the sequence; the sequence is
        # chomped, so the last valid start index is length - $width
        foreach my $index (0..length($sequence) - $width) {

            # extract the motif at $index from sequence
            my $motif = substr($sequence, $index, $width);
            my $score = $motif_score->{$motif};

            # windows containing characters outside A/C/G/T have no score
            next unless defined $score;

            # update max score if the motif has a higher score
            if (!defined $max or $score > $max) {
                $max         = $score;
                $maxmot{$id} = $motif;
            }
        }
    }
}

print Data::Dumper->Dump([\%maxmot], [qw(*maxmot)]);

exit 0;



$ perl
%maxmot = (
            'Sequence_2' => 'TCA',
            'Sequence_3' => 'TCG',
            'Sequence_1' => 'TCA'


use warnings;
use strict;

use List::Util qw(first pairs);
use Data::Dumper;

# Pre-compute the score of every possible motif, sort the motifs from best
# to worst, and for each sequence report the first motif that occurs in it.

# Motif width; must equal the number of columns per base in %logodds.
my $width = 3;

my %logodds = (
    A => [0.1, 0.2, 0.3],
    C => [0.2, 0.5, 0.2],
    G => [0.3, 0.2, 0.4],
    T => [0.4, 0.1, 0.1],
);
my @bases = keys %logodds;

# calculate log score for each motif combination
my $motif_logscore = {'' => 0}; # start with a 0-length motif
foreach my $offset (0..$width - 1) {
    my %score;

    # for all motifs...
    foreach my $prefix (keys %{ $motif_logscore }) {
        my $base_score = $motif_logscore->{$prefix};

        # ... add another base to the motif
        for my $base (@bases) {
            $score{"${prefix}${base}"} = $base_score + $logodds{$base}[$offset];
        }
    }

    # update hash ref to new motif scores
    $motif_logscore = \%score;
}

#print Data::Dumper->Dump([$motif_logscore], [qw(motif_logscore)]);

my @motifs_sorted =
    # list of [<motif>, <regular expression>] array refs
    map    { [$_->[0], qr/$_->[0]/] }
    # sort in descending numerical score order (<=>, not the string
    # comparison cmp); equal scores are ordered by motif so that the result
    # does not depend on hash ordering
    sort   { $b->[1] <=> $a->[1] or $a->[0] cmp $b->[0] }
    # list of [<motif>, <score>] array refs
    pairs %{ $motif_logscore };

#print Data::Dumper->Dump([\@motifs_sorted], [qw(*motifs_sorted)]);

my $id;
my %maxmot;   # id => best-scoring motif found in that sequence

# Input is read from the script's __DATA__ section, which is not part of
# this excerpt: a ">id" header line followed by a sequence line.
while (<DATA>) {
    if (/^>(.+)/) {
        $id = $1;
    } else {
        my $sequence = $_;
        # find the first pair where the regex matches -> store motif
        my $best = first { $sequence =~ $_->[1] } @motifs_sorted;

        # a sequence without any known motif yields undef instead of dying
        # on an undefined array reference
        $maxmot{$id} = $best ? $best->[0] : undef;
    }
}

print Data::Dumper->Dump([\%maxmot], [qw(*maxmot)]);

exit 0;


# Uses the base at index $pos3 of sequence $id as a key of the nested hash
# under $maxmot{$id}; the value is undef, so only the key carries information.
# NOTE(review): presumably meant as a replacement for the push in the
# question's code, so that repeated bases collapse into one key - confirm.
undef $maxmot{$id}{ $HoA{$id}[$pos3] };
