得到'内存不足!'非常复杂的Perl脚本中的错误

时间:2013-05-16 23:17:52

标签: perl

我刚才在Ask Ubuntu问this question,并被指示在这里提供一些代码,因为它可能是一个优化问题。我已经包含了整个脚本。

总体目标是在目录中浏览大约7000个HTML文件并从中解析特定信息,并将其作为一行导出到文本文件中。

#!/usr/bin/perl

use Switch;
use strict;

use HTML::Query 'Query';

my $dir = '/home/mark/Documents/Perl/garchivesfiles/completeresults';

opendir my $dh, $dir or die "Can't open $dir: $!";
my @files = map {"$dir/$_"} grep { $_ !~ /^\./ } readdir $dh;
closedir $dh;

my $total;

my %xlateNum2Text =  qw (0   January
                         1   Febuary
                         2   March
                         3   April
                         4   May
                         5   June
                         6   July
                         7   August
                         8   September
                         9   October
                         10  November
                         11  December                  
                       );


my $inc = 0;
foreach my $file (@files) {
    open FILE, $file;
    my $html = do { local $/; <FILE> };
    my $q = Query(text => $html);

    my @homescore = $q->query("span.homeScore");
    my @awayscore = $q->query("span.awayScore");
    my $singlehomescore = $homescore[0]->as_text();
    my $singleawayscore = $homescore[0]->as_text();

    my @hometeam = $q->query("table.teaminfo td.home span");
    my @awayteam = $q->query("table.teaminfo td.away span");
    my $singlehometeam = rightTeamName($hometeam[0]->as_text());
    my $singleawayteam = rightTeamName($awayteam[0]->as_text());


    my @homegoalstotal;
    my @awaygoalstotal;
    my @datearray;
    my @fixtureinfo;

    my @newhomegoals;
    my @newawaygoals;

    my @allinfogoals;

    if($singlehomescore ne "0" || $singleawayscore ne "0") {
        @homegoalstotal = $q->query("div.home ul li");
        @awaygoalstotal = $q->query("div.away ul li");
        my $i = 0;

        @datearray = $q->query("p.fixtureinfo span");
        my $finaldate = $datearray[0]->as_text();
        my @datecomponents = split(" ", $finaldate);
        my $mysqlyyyy = $datecomponents[3];
        my $mysqlmm = monthConvert($datecomponents[2]); 
        my $mysqldd = $datecomponents[1];

        my $mysqldate;

        if(length($mysqlmm) == 1) {
            $mysqlmm = "0".$mysqlmm;
        }

        if(length($mysqldd) == 1) {
            $mysqldd = "0".$mysqldd;
        }

        $mysqldate = $mysqlyyyy."-".$mysqlmm."-".$mysqldd;


        @fixtureinfo = $q->query("p.fixtureinfo");
        my $fixtureinfoinit = $fixtureinfo[0]->as_text();
        my @fixtureinfobrokenup = split(/ \| /, $fixtureinfoinit);
        my $fixtureinfostring = $fixtureinfobrokenup[1];

        foreach my $goal (@homegoalstotal) {
            my $tempmodifier = $goal->as_text();
            $tempmodifier =~ s/\)//g;
            my @tempcomponents = split(' \(', $tempmodifier);
            my $substitutetemp;
            my @extratimesplit;
            my $compositetime;

            if(index($tempcomponents[1], ",") != -1) {
                my @goaltimes = split('\,', $tempcomponents[1]);
                foreach my $individmultgoal (@goaltimes) {
                    $individmultgoal =~ s/Pen//g;
                    $individmultgoal =~ s/ //g;
                    if(index($individmultgoal, "OG") == -1) {
                        if(index($individmultgoal, "+") != -1) {
                            @extratimesplit = split('\+', $individmultgoal);
                            $compositetime = $extratimesplit[0];
                            push (@{$allinfogoals[$i]}, ($tempcomponents[0], $compositetime, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], ,$datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit[1]));
                            $i++;
                        } else {
                            push (@{$allinfogoals[$i]}, ($tempcomponents[0], $individmultgoal, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], ,$datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                            $i++;
                        }   
                    }
                }
            } else {
                $substitutetemp = $tempcomponents[1];
                $substitutetemp =~ s/Pen//g;
                $substitutetemp =~ s/ //g;
                if(index($substitutetemp, "OG") == -1) {
                    if(index($substitutetemp, "+") != -1) {
                        @extratimesplit = split('\+', $substitutetemp);
                        $compositetime = $extratimesplit[0];
                        push (@{$allinfogoals[$i]}, ($tempcomponents[0], $compositetime, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit[1]));
                        $i++;
                    } else {
                        push (@{$allinfogoals[$i]}, ($tempcomponents[0], $substitutetemp, "for:".$singlehometeam, $singleawayteam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                        $i++;
                    }
                }
            }
        }

        foreach my $goal (@awaygoalstotal) {
            my $tempmodifier2 = $goal->as_text();
            $tempmodifier2 =~ s/\)//g;
            my @tempcomponents2 = split(' \(', $tempmodifier2);
            my $substitutetemp2;
            my @extratimesplit2;
            my $compositetime2;

            if(index($tempcomponents2[1], ",") != -1) {
                my @goaltimes2 = split('\,', $tempcomponents2[1]);
                foreach my $individmultgoal2 (@goaltimes2) {
                    $individmultgoal2 =~ s/Pen//g;
                    $individmultgoal2 =~ s/ //g;
                    if(index($individmultgoal2, "OG") == -1) {
                        if(index($individmultgoal2, "+") != -1) {
                            @extratimesplit2 = split('\+', $individmultgoal2);
                            $compositetime2 = $extratimesplit2[0];
                            push (@{$allinfogoals[$i]}, ($tempcomponents2[0], $compositetime2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit2[1]));
                            $i++;
                        } else {
                            push (@{$allinfogoals[$i]}, ($tempcomponents2[0], $individmultgoal2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                            $i++;
                        }
                    }
                }
            } else {
                $substitutetemp2 = $tempcomponents2[1];
                $substitutetemp2 =~ s/Pen//g;
                $substitutetemp2 =~ s/ //g;
                if(index($substitutetemp2, "OG") == -1) {
                    if(index($substitutetemp2, "+") != -1) {
                        @extratimesplit2 = split('\+', $substitutetemp2);
                        $compositetime2 = $extratimesplit2[0];
                        push(@{$allinfogoals[$i]}, ($tempcomponents2[0], $compositetime2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, $extratimesplit2[1]));
                        $i++;
                    } else {
                        push(@{$allinfogoals[$i]}, ($tempcomponents2[0], $substitutetemp2, "for:".$singleawayteam, $singlehometeam, $datecomponents[1], $datecomponents[2], $datecomponents[3], $fixtureinfostring, "Barclays Premier League", monthConvert($datecomponents[2]), $mysqldate, 0));
                        $i++;
                    }
                }
            }
        }


        @allinfogoals = sort { $a->[1] <=> $b->[1] || $a->[12] <=> $b->[12] } @allinfogoals;

        open(GOALCSV, '>>goalcsv.txt');

        my $homegoalcount = 0;
        my $awaygoalcount = 0;

        foreach my $row(@allinfogoals){
           foreach my $val(@$row){
                if($val eq "for:".$singlehometeam) {
                    $homegoalcount++;
                    print GOALCSV "$val,".$homegoalcount.",".$awaygoalcount.",true,";
                    print "$val,".$homegoalcount.",".$awaygoalcount.",true,";
                } elsif($val eq "for:".$singleawayteam) {
                    $awaygoalcount++;
                    print GOALCSV "$val,".$awaygoalcount.",".$homegoalcount.",false,";
                    print "$val,".$awaygoalcount.",".$homegoalcount.",false,";
                } else {
                    print GOALCSV "$val,";
                    print "$val,";
                }
           }
           print GOALCSV "\n";
           print "\n";
        }
    }

}

sub rightTeamName{
    my $teamname = $_[0];

    switch($teamname) {
        case "Nott'm Forest" { return "Nottingham Forest" }
        case "QPR"  { return "Queens Park Rangers" }
        case "Southampton" { return "Southampton FC" }
        case "Norwich" { return "Norwich City" }
        case "Tottenham" { return "Tottenham Hotspur" }
        case "Leeds" { return "Leeds United" }
        case "Middlesbrough" { return "Middlesbrough FC" }
        case "Chelsea" { return "Chelsea FC" }
        case "Arsenal" { return "Arsenal FC" }
        case "Oldham" { return "Oldham Athletic" }
        case "Ipswich" { return "Ipswich Town" }
        case "Man Utd" { return "Manchester United" }
        case "Man City" { return "Manchester City" }
        case "Sheffield Wed" { return "Sheffield Wednesday" }
        case "Man City" { return "Manchester City" }
        case "Blackburn" { return "Blackburn Rovers" }
        case "Wimbledon" { return "AFC Wimbledon" }
        case "Liverpool" { return "Liverpool FC" }
        case "Coventry" { return "Coventry City" }
        else        { return $teamname }

    }
}

sub monthConvert{
        switch($_[0]) {
            case "January" { return 1 }
            case "February" { return 2 }
            case "March" { return 3 }
            case "April" { return 4 }
            case "May" { return 5 }
            case "June" { return 6 }
            case "July" { return 7 }
            case "August" { return 8 }
            case "September" { return 9 }
            case "October" { return 10 }
            case "November" { return 11}
            case "December" { return 12 }
        }
}

2 个答案:

答案 0 :(得分:3)

很可能一个或多个文件非常大。

在您浏览文件时打印出文件的名称。你会看到你的代码每次都会中断其中一个。

答案 1 :(得分:3)

HTML :: Query使用HTML :: Element和HTML :: TreeBuilder来为文档的节点建模。节点以复杂的方式连接,使Perl垃圾收集器无法清理节点。因此,您必须

  • 断言你有一个可以使用弱引用的HTML :: Element版本。这些不会阻止垃圾回收。 use HTML::TreeBuilder 5 -weak应该可以解决问题。

  • delete方法的任何结果致电query

有关详细信息,请参阅文档(例如,在HTML::Element中)。

以下是脚本的清理版本,它试图减少代码重复(原始代码中有明显的复制和粘贴迹象)。它仍然不漂亮,有些WTF仍然存在,但它应该是可维护性的改进。值得注意的是,我不知道@allinfogoals的第12列是什么(在排序中),或者为什么以这种相当奇怪的方式发出CSV(我们已经知道for:的索引列(→2),因此我们没有将每列与预期值匹配。)

提示理解一些缺少的if-elses:当一个字符串不包含某个子字符串时,那么在该子字符串上拆分字符串的返回值等于原始字符串。代码:

use Test::More;
my ($string, $substring) = ("foo+bar", "-"); # try it yourself!
my ($split) = split /\Q$substring\E/, $string;
if (-1 == index $string, $substring) {
  is $split, $string;
} else {
  isnt $split, $string;
}
done_testing;

以下是已清理的版本:

#!/usr/bin/perl

use strict; use warnings;

use HTML::TreeBuilder 5 -weak;
use HTML::Query;

my $dir = '/home/mark/Documents/Perl/garchivesfiles/completeresults';

opendir my $dh, $dir or die "Can't open $dir: $!";

while (my $filename = readdir $dh) {
    next if $filename =~ /^\./;
    my $q = HTML::Query->new(file => "$dir/$filename");

    my $homescore = $q->query("span.homeScore")->first->as_text;
    my $awayscore = $q->query("span.awayScore")->first->as_text;

    my $hometeam = correctTeamName($q->query("table.teaminfo td.home span")->first->as_text);
    my $awayteam = correctTeamName($q->query("table.teaminfo td.away span")->first->as_text);

    my @allinfogoals;

    if($homescore ne "0" || $awayscore ne "0") {

        my ($fixtureinfo_span) = $q->query("p.fixtureinfo span");
        my (undef, $day, $month, $year) = split ' ', $fixtureinfo_span->as_text;
        my $mysqldate = sprintf '%04d-%02d-%02d', $year, monthConvert($month), $day;

        my ($fixtureinfo) = $q->query('p.fixtureinfo');
        my (undef, $fixtureinfostring) = split / \| /, $fixtureinfo->as_text;

        for my $goal_list (
            [$hometeam, $awayteam, [$q->query("div.home ul li")->as_text]],
            [$awayteam, $hometeam, [$q->query("div.away ul li")->as_text]]
        ) {
            my ($thisteam, $otherteam, $goalstotal) = @$goal_list;
            for my $goal (@$goalstotal) {
                $goal =~ s/\)//g;
                my ($tempcomponent_1, $tempcomponent) = split / \(/, $goal;

                for my $individmultgoal (split/,/, $tempcomponent) {
                    next if -1 != index $individmultgoal, 'OG';
                    $individmultgoal =~ s/Pen//g;
                    $individmultgoal =~ s/ //g;
                    my @timesplit = 
                        (index($individmultgoal, "+") != -1)
                        ? (split /\+/, $individmultgoal)
                        : ($individmultgoal, 0);
                    push @allinfogoals, [
                        $tempcomponent_1,
                        $timesplit[0],
                        "for:$thisteam",
                        $otherteam,
                        $day,
                        $month,
                        $year,
                        $fixtureinfostring,
                        "Barclays Premier League",
                        monthConvert($month),
                        $mysqldate,
                        $timesplit[1],
                    ];
                }
            }
        }

        @allinfogoals = sort { $a->[1] <=> $b->[1] || $a->[12] <=> $b->[12] } @allinfogoals;

        open my $GOALCSV, '>>', 'goalcsv.txt' or die "Can't open goalcsv.txt: $!";

        my $print_both = sub {
            print {$GOALCSV} @_;
            print            @_;
        };

        my $homegoalcount = 0;
        my $awaygoalcount = 0;

        for my $row (@allinfogoals){
            for my $val(@$row){
                if($val eq "for:$hometeam") {
                    $homegoalcount++;
                    $print_both->("$val,$homegoalcount,$awaygoalcount,true,");
                } elsif($val eq "for:$awayteam") {
                    $awaygoalcount++;
                    $print_both->("$val,$awaygoalcount,$homegoalcount,false,");
                } else {
                    $print_both->("$val,");
                }
            }
            $print_both->("\n");
        }
    }
}

closedir $dh;

sub correctTeamName{
    my %teamnames = (
        "Nott'm Forest" => "Nottingham Forest",
        "QPR"           => "Queens Park Rangers",
        "Southampton"   => "Southampton FC",
        "Norwich"       => "Norwich City",
        "Tottenham"     => "Tottenham Hotspur",
        "Leeds"         => "Leeds United",
        "Middlesbrough" => "Middlesbrough FC",
        "Chelsea"       => "Chelsea FC",
        "Arsenal"       => "Arsenal FC",
        "Oldham"        => "Oldham Athletic",
        "Ipswich"       => "Ipswich Town",
        "Man Utd"       => "Manchester United",
        "Man City"      => "Manchester City",
        "Sheffield Wed" => "Sheffield Wednesday",
        "Man City"      => "Manchester City",
        "Blackburn"     => "Blackburn Rovers",
        "Wimbledon"     => "AFC Wimbledon",
        "Liverpool"     => "Liverpool FC",
        "Coventry"      => "Coventry City",
    );
    return exists $teamnames{$_[1]} ? $teamnames{$_[1]} : $_[0];
}

sub monthConvert{
    my $i = 1;
    my %months = map { $_ => $i++ } qw/
        January February    March
        April   May         June
        July    August      September
        October November    December
    /;
    exists $months{$_[0]} or die "Unknown month name $_[0]";
    return $months{$_[0]};
}

注意:代码未经测试,因为未提供示例文件。至少它编译。<​​/ p>