Perl Parser / Analyzer产生错误的反馈

时间:2013-01-15 13:21:11

标签: perl parsing bioinformatics dna-sequence

我遇到了问题......我对perl的了解非常有限。我的一位同事拥有必须使用以下代码运行的数据。一切正常,除了我将在下面给出的发生的序列。我希望你能理解我只能上传一小部分必须阅读的文件。也许数组x以错误的方式填充......我什么都不知道。

要阅读的数据:

>1 length 64 cvg_0.0_tip_0
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
>3 length 64 cvg_0.0_tip_0
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC
>5 length 64 cvg_0.0_tip_0
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT
>7 length 64 cvg_0.0_tip_0
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG
>9 length 64 cvg_0.0_tip_0
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTG
>11 length 64 cvg_0.0_tip_0
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTA
>13 length 64 cvg_0.0_tip_0
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTC
>15 length 64 cvg_0.0_tip_0
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGA
>17 length 64 cvg_0.0_tip_0
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGT
>19 length 64 cvg_0.0_tip_0
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGG
>21 length 64 cvg_0.0_tip_0
CTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTG
>23 length 64 cvg_0.0_tip_0
CTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTC
>25 length 64 cvg_0.0_tip_0
AATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
>27 length 64 cvg_0.0_tip_0
AATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTA
>29 length 64 cvg_0.0_tip_0
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAG
>31 length 64 cvg_0.0_tip_0
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGGA
>33 length 64 cvg_0.0_tip_0
TTGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
>35 length 64 cvg_0.0_tip_0
TTGGTGCATCTTTGTCTACATGACTCCATAACATCAAGATTGTCAAAAGCTTGCCCAAATGGTT

代码:

#!/usr/bin/perl
use strict;
use Getopt::Std qw'getopts';

my %opt;
getopts 'hHjt:', \%opt;
my $opt_threshold = defined $opt{'t'} ? $opt{'t'} : 100;
my $opt_filename = $opt{'H'} || (@ARGV > 1 && !$opt{'h'});
my $opt_jira = $opt{'j'};

sub eng($)
{
    my $x = shift;
    return $x if $x < 10000000;
    return substr($x / 1000000, 0, 5) . 'e6';
}

my ($short, $sum, $ambiguous, $any, $other);
my @x;

sub count($$)
{
    my $id = shift;
    my $seq = uc shift;
    #print $id, length $seq;
    #print "\n";
    my $x = $seq =~ tr/ACGT//;
    my $colourspace = $seq =~ tr/0123//;
    die unless $x == 0 || $colourspace == 0;
    $x = $colourspace if $x == 0;
    my $myambiguous = $seq =~ tr/KYSBWRDMHV//;
    my $myany = $seq =~ tr/N//;
    if ($x < $opt_threshold) {
        $short++;
        return;
    }
    $sum += $x;
    $ambiguous += $myambiguous;
    $any += $myany;
    $other += (length $seq) - $x - $myambiguous - $myany;
    push @x, $x;
}

sub fac($)
{
    my $path = shift;
    $short = $sum = 0;
    $ambiguous = $any = $other = 0;
    @x = ();

    my $id;
    my $seq;
    open IN, "<$path" or die "$path: $!\n";
    while (<IN>) {
        chomp;
        if (/^>/) {
            count $id, $seq if defined $id;
            $id = $_;
            $seq = '';
        } else {
            $seq .= $_;
            push @x, $seq;
        }
    }
    count $id, $seq if defined $id;
    close IN;

    my $n = @x;
    if ($n > 0) {
        my $mean = int $sum / $n;

        @x = sort { $a <=> $b } @x;
        my $min = $x[0];
        #my $q1 = $x[@x/4];
        my $q2 = $x[@x/2];
        #my $q3 = $x[@x*3/4];
        my $max = $x[-1];

        my ($nn50, $n50);
        my $n50sum = 0;
        while ($n50sum < $sum / 2) {
            $nn50++;
            $n50 = pop @x;
            $n50sum += $n50;
        }
        #my $np = int 100 * $n50sum / $sum;

        my $ntotal = eng $short + $n;
        my $sumeng = eng $sum;
print "$n50\t$path\n";
=pod
        print "$ntotal$," if $opt_threshold > 0;
        print $n, $nn50, $min, $q2, $mean, $n50, $max;
        #printf "$,%#.3g", $sum;
        print $, . eng($sum);
        print "$,ambig=$ambiguous" if $ambiguous > 0;
        print "$,any=$any" if $any > 0;
        print "$,other=$other" if $other > 0;
        print "$,$path" if $opt_filename;
        print "\n";
=cut


        format Spaces =
@<<<<<<<@<<<<<<@<<<<<<@<<<<<<@<<<<<<@<<<<<<@<<<<<<@<<<<<<@<<<<<< ^<<<<<<<<<<<<<
$ntotal, $n, $nn50, $min, $q2, $mean, $n50, $max, $sumeng, $path
                                                                 ^<<<<<<<<<<<<<~~
$path
.
        format Pipes =
|@<<<<<<<|@<<<<<|@<<<<<|@<<<<<|@<<<<<|@<<<<<|@<<<<<|@<<<<<|@<<<<<<|@*|
$ntotal, $n, $nn50, $min, $q2, $mean, $n50, $max, $sumeng, $path
.
        $~ = $opt_jira ? 'Pipes' : 'Spaces';
        $^ = $opt_jira ? 'Pipes_TOP' : 'Spaces_TOP';
        $: = '/- ';
        write;
=pod
        print "$,ambig=$ambiguous" if $ambiguous > 0;
        print "$,any=$any" if $any > 0;
        print "$,other=$other" if $other > 0;
        print "$,$path" if $opt_filename;
        print "\n";
=cut
    } else {
        print STDERR "warning: `$path' is empty\n";
    }
}

format Spaces_TOP =
n       n:@<<< n:N50  min    median   mean   N50    max    sum
$opt_threshold
.
format Pipes_TOP =
||n     ||n:@<<||n:N50||min  ||median  ||mean||N50  ||max  ||sum   ||
$opt_threshold
.

=pod
$, = "\t";
if ($opt_threshold > 0) {
    print 'n', "n:$opt_threshold$,";
} else {
    print "n$,";
}
print 'n:N50', 'min', 'median', 'mean', 'N50', 'max', "sum\n";
=cut

@ARGV = ('-') if @ARGV == 0;
fac $_ foreach @ARGV;

这就是我得到的:

n       n:100  n:N50  min    median   mean   N50    max    sum 
204739  200636 253    AAAAAAACCTGCTG93     22803  138379 18.78e6

这应该是什么(某种程度上):

n       n:100  n:N50  min    median mean   N50    max    sum 
36050   8847   1692   100    463    691    1119   6260   6121464  k49_q25.contig

0 个答案:

没有答案