我刚才在 Ask Ubuntu 上提了这个问题，并被建议在这里提供一些代码，因为它可能是一个优化问题。下面附上了完整的脚本。
总体目标是在目录中浏览大约7000个HTML文件并从中解析特定信息,并将其作为一行导出到文本文件中。
#!/usr/bin/perl
use Switch;
use strict;
use HTML::Query 'Query';
# Scrape every saved match page under $dir and append one comma-separated
# line per recorded goal to goalcsv.txt, echoing each line to STDOUT as well.
#
# Each goal row holds these columns, in order:
#   0 scorer text, 1 minute, 2 "for:<scoring team>", 3 opposing team,
#   4 day, 5 month name, 6 year, 7 second "|"-separated part of the
#   fixture info, 8 competition label, 9 month number, 10 yyyy-mm-dd date,
#   11 added time after "+" (0 when there is none).
# When a row is printed, the "for:" column is followed by the scoring
# team's running goal count, the other team's count, and true/false for
# home/away.
my $dir      = '/home/mark/Documents/Perl/garchivesfiles/completeresults';
my $csv_path = 'goalcsv.txt';

opendir my $dh, $dir or die "Can't open $dir: $!";
my @files = map {"$dir/$_"} grep { $_ !~ /^\./ } readdir $dh;
closedir $dh;

# Sort keys inside a goal row. Rows have indices 0..11, so the added-time
# tie-break has to read column 11; index 12 does not exist.
my $MINUTE_COL     = 1;
my $ADDED_TIME_COL = 11;

# The output file is opened once for the whole run instead of once per page.
open my $csv, '>>', $csv_path or die "Can't open $csv_path: $!";

MATCH:
foreach my $file (@files) {
    my $in;
    unless (open $in, '<', $file) {
        warn "Skipping $file: cannot open: $!\n";
        next MATCH;
    }
    my $html = do { local $/; <$in> };
    close $in;
    next MATCH unless defined $html;

    # NOTE(review): the parse tree behind $q is not explicitly released
    # here; if memory grows over thousands of files, confirm whether
    # HTML::Query frees it or whether the tree needs an explicit delete.
    my $q = Query(text => $html);

    my ($home_score_el) = $q->query("span.homeScore");
    my ($away_score_el) = $q->query("span.awayScore");
    my ($home_team_el)  = $q->query("table.teaminfo td.home span");
    my ($away_team_el)  = $q->query("table.teaminfo td.away span");
    if (grep { !defined $_ }
        $home_score_el, $away_score_el, $home_team_el, $away_team_el)
    {
        # Name the offending file instead of dying on an undefined element.
        warn "Skipping $file: score/team elements not found\n";
        next MATCH;
    }

    my $home_score = $home_score_el->as_text();
    # Read the away score from the away element; taking it from the home
    # element made every 0-N fixture look goalless.
    my $away_score = $away_score_el->as_text();
    my $home_team  = rightTeamName($home_team_el->as_text());
    my $away_team  = rightTeamName($away_team_el->as_text());

    # Goalless fixtures produce no rows.
    next MATCH if $home_score eq "0" && $away_score eq "0";

    my ($date_el)    = $q->query("p.fixtureinfo span");
    my ($fixture_el) = $q->query("p.fixtureinfo");
    unless (defined $date_el && defined $fixture_el) {
        warn "Skipping $file: fixture info not found\n";
        next MATCH;
    }

    # The date text is split on whitespace; fields 1..3 are used as day,
    # month name and year (field 0 is ignored).
    my (undef, $day, $month_name, $year) = split(" ", $date_el->as_text());

    # Called once, in scalar context, so an unrecognised month can never
    # change the number of columns in a row.
    my $month_number = monthConvert($month_name);
    my $mysql_date   = join "-",
        map { defined $_ ? $_ : '' }
        $year, padTwoDigits($month_number), padTwoDigits($day);

    my (undef, $fixture_info) = split(/ \| /, $fixture_el->as_text());

    my @rows;
    for my $side (
        [ $home_team, $away_team, "div.home ul li" ],
        [ $away_team, $home_team, "div.away ul li" ],
    ) {
        my ($scoring_team, $opponent, $selector) = @$side;
        for my $item ($q->query($selector)) {
            for my $goal (parseGoalEntry($item->as_text())) {
                my ($scorer, $minute, $added_time) = @$goal;
                push @rows, [
                    $scorer,
                    $minute,
                    "for:" . $scoring_team,
                    $opponent,
                    $day,
                    $month_name,
                    $year,
                    $fixture_info,
                    "Barclays Premier League",
                    $month_number,
                    $mysql_date,
                    $added_time,
                ];
            }
        }
    }

    @rows = sort {
               $a->[$MINUTE_COL]     <=> $b->[$MINUTE_COL]
            || $a->[$ADDED_TIME_COL] <=> $b->[$ADDED_TIME_COL]
    } @rows;

    my $home_goal_count = 0;
    my $away_goal_count = 0;
    for my $row (@rows) {
        my $line = '';
        for my $val (@$row) {
            my $field = defined $val ? $val : '';
            if ($field eq "for:" . $home_team) {
                $home_goal_count++;
                $line .= "$field,$home_goal_count,$away_goal_count,true,";
            }
            elsif ($field eq "for:" . $away_team) {
                $away_goal_count++;
                $line .= "$field,$away_goal_count,$home_goal_count,false,";
            }
            else {
                $line .= "$field,";
            }
        }
        $line .= "\n";
        print {$csv} $line;
        print $line;
    }
}

# Buffered write errors only surface at close, so check it.
close $csv or die "Can't close $csv_path: $!";

# Left-pad a one-character value with "0"; anything else (including undef)
# is returned unchanged.
sub padTwoDigits {
    my ($value) = @_;
    return (defined $value && length($value) == 1) ? "0" . $value : $value;
}

# Split one goal list item such as "Name (12, 45+2, Pen 60)" into a list of
# [scorer, minute, added_time] array refs, one per listed time. "Pen" and
# spaces are stripped from each time, times containing "OG" are dropped, and
# "45+2" becomes minute 45 with added time 2 (added time is 0 otherwise).
sub parseGoalEntry {
    my ($text) = @_;
    $text =~ s/\)//g;
    my ($scorer, $times) = split(/ \(/, $text);

    my @raw_times = (defined $times && index($times, ",") != -1)
        ? split(/,/, $times)
        : ($times);

    my @goals;
    for my $raw (@raw_times) {
        my $time = defined $raw ? $raw : '';
        $time =~ s/Pen//g;
        $time =~ s/ //g;
        next if index($time, "OG") != -1;
        my ($minute, $added_time) = index($time, "+") != -1
            ? split(/\+/, $time)
            : ($time, 0);
        push @goals, [ $scorer, $minute, $added_time ];
    }
    return @goals;
}
# Translate the short club name found on a match page into the longer name
# used in the output. Names without an entry (and undef) are returned
# unchanged.
#
# A plain hash lookup is used instead of the Switch source filter; the table
# is built inside the sub so it is populated no matter where in the file the
# sub is defined relative to its callers.
sub rightTeamName{
    my ($teamname) = @_;
    return $teamname unless defined $teamname;

    my %long_name_for = (
        "Nott'm Forest" => "Nottingham Forest",
        "QPR"           => "Queens Park Rangers",
        "Southampton"   => "Southampton FC",
        "Norwich"       => "Norwich City",
        "Tottenham"     => "Tottenham Hotspur",
        "Leeds"         => "Leeds United",
        "Middlesbrough" => "Middlesbrough FC",
        "Chelsea"       => "Chelsea FC",
        "Arsenal"       => "Arsenal FC",
        "Oldham"        => "Oldham Athletic",
        "Ipswich"       => "Ipswich Town",
        "Man Utd"       => "Manchester United",
        "Man City"      => "Manchester City",
        "Sheffield Wed" => "Sheffield Wednesday",
        "Blackburn"     => "Blackburn Rovers",
        "Wimbledon"     => "AFC Wimbledon",
        "Liverpool"     => "Liverpool FC",
        "Coventry"      => "Coventry City",
    );

    return exists $long_name_for{$teamname}
        ? $long_name_for{$teamname}
        : $teamname;
}
# Convert an English month name ("January" .. "December") to its number
# (1 .. 12). An unrecognised or undefined name yields undef.
#
# The sub always returns exactly one scalar, even in list context: callers
# place the call inside a list of row columns, where an empty return would
# shift every following column.
sub monthConvert{
    my ($month_name) = @_;

    my %month_number_of = (
        January   => 1,
        February  => 2,
        March     => 3,
        April     => 4,
        May       => 5,
        June      => 6,
        July      => 7,
        August    => 8,
        September => 9,
        October   => 10,
        November  => 11,
        December  => 12,
    );

    my $number;
    $number = $month_number_of{$month_name} if defined $month_name;
    return $number;
}
答案 0 :(得分:3)
很可能一个或多个文件非常大。
在处理每个文件时打印出它的文件名。你会看到代码每次都在其中某一个文件上中断。
答案 1 :(得分:3)
HTML::Query 使用 HTML::Element 和 HTML::TreeBuilder 来为文档的节点建模。节点之间以复杂的方式相互引用，使 Perl 的垃圾收集器无法清理这些节点。因此，您必须采取以下措施之一：
确保您使用的 HTML::Element 版本支持弱引用（弱引用不会阻止垃圾回收）。`use HTML::TreeBuilder 5 -weak;` 应该可以解决问题。
或者，对 `query` 方法返回的任何结果调用 `delete` 方法。
有关详细信息,请参阅文档(例如,在HTML::Element中)。
以下是脚本的清理版本，它试图减少代码重复（原始代码中有明显的复制粘贴痕迹）。它仍然不漂亮，有些令人费解的地方（WTF）仍然存在，但在可维护性上应该有所改进。值得注意的是，我不知道排序中用到的 `@allinfogoals` 的第 12 列是什么，也不知道为什么要以这种相当奇怪的方式输出 CSV（我们已经知道 `for:` 列的索引是 2，因此没有必要把每一列都与预期值逐一比较）。
关于清理版本中被省略的一些 if-else 的提示：当一个字符串不包含某个子字符串时，以该子字符串拆分它所得到的第一个元素就等于原始字符串。示例代码：
use Test::More;
# Demonstration: when the separator does not occur in the text, the first
# field returned by split is the whole text; when it does occur, the first
# field differs from the text.
my ($text, $separator) = ("foo+bar", "-"); # try it yourself!
my ($first_field) = split /\Q$separator\E/, $text;
my $separator_found = index($text, $separator) != -1;
if ($separator_found) {
    isnt $first_field, $text;
}
else {
    is $first_field, $text;
}
done_testing;
以下是已清理的版本:
#!/usr/bin/perl
use strict; use warnings;
use HTML::TreeBuilder 5 -weak;
use HTML::Query;
# Walk every match page in $dir and append one comma-separated line per
# recorded goal to goalcsv.txt, echoing each line to STDOUT as well.
#
# Each goal row holds these columns, in order:
#   0 scorer text, 1 minute, 2 "for:<scoring team>", 3 opposing team,
#   4 day, 5 month name, 6 year, 7 second "|"-separated part of the
#   fixture info, 8 competition label, 9 month number, 10 yyyy-mm-dd date,
#   11 added time after "+" (0 when there is none).
my $dir = '/home/mark/Documents/Perl/garchivesfiles/completeresults';
opendir my $dh, $dir or die "Can't open $dir: $!";
while (my $filename = readdir $dh) {
    next if $filename =~ /^\./;
    my $q = HTML::Query->new(file => "$dir/$filename");
    my $homescore = $q->query("span.homeScore")->first->as_text;
    my $awayscore = $q->query("span.awayScore")->first->as_text;
    my $hometeam = correctTeamName($q->query("table.teaminfo td.home span")->first->as_text);
    my $awayteam = correctTeamName($q->query("table.teaminfo td.away span")->first->as_text);

    # Goalless fixtures produce no rows.
    next if $homescore eq "0" && $awayscore eq "0";

    my ($fixtureinfo_span) = $q->query("p.fixtureinfo span");
    # The date text is split on whitespace; the first field is ignored.
    my (undef, $day, $month, $year) = split ' ', $fixtureinfo_span->as_text;
    my $month_number = monthConvert($month);
    my $mysqldate = sprintf '%04d-%02d-%02d', $year, $month_number, $day;
    my ($fixtureinfo) = $q->query('p.fixtureinfo');
    my (undef, $fixtureinfostring) = split / \| /, $fixtureinfo->as_text;

    my @allinfogoals;
    for my $goal_list (
        [$hometeam, $awayteam, [$q->query("div.home ul li")->as_text]],
        [$awayteam, $hometeam, [$q->query("div.away ul li")->as_text]]
    ) {
        my ($thisteam, $otherteam, $goalstotal) = @$goal_list;
        for my $goal (@$goalstotal) {
            $goal =~ s/\)//g;
            my ($scorer, $times) = split / \(/, $goal;
            # An entry without a "(minute)" part has no times to record.
            next unless defined $times;
            for my $time (split /,/, $times) {
                next if -1 != index $time, 'OG';
                $time =~ s/Pen//g;
                $time =~ s/ //g;
                # "45+2" becomes (45, 2); a plain minute gets added time 0.
                my @timesplit =
                    (index($time, "+") != -1)
                        ? (split /\+/, $time)
                        : ($time, 0);
                push @allinfogoals, [
                    $scorer,
                    $timesplit[0],
                    "for:$thisteam",
                    $otherteam,
                    $day,
                    $month,
                    $year,
                    $fixtureinfostring,
                    "Barclays Premier League",
                    $month_number,
                    $mysqldate,
                    $timesplit[1],
                ];
            }
        }
    }

    # Order by minute, then by added time. Rows have indices 0..11, so the
    # tie-break reads column 11; index 12 does not exist.
    @allinfogoals = sort { $a->[1] <=> $b->[1] || $a->[11] <=> $b->[11] } @allinfogoals;

    open my $GOALCSV, '>>', 'goalcsv.txt' or die "Can't open goalcsv.txt: $!";
    # Every piece of output goes both to the CSV file and to STDOUT.
    my $print_both = sub {
        print {$GOALCSV} @_;
        print @_;
    };
    my $homegoalcount = 0;
    my $awaygoalcount = 0;
    for my $row (@allinfogoals) {
        for my $val (@$row) {
            # The "for:" column is followed by the scoring team's running
            # goal count, the other team's count, and a home/away flag.
            if ($val eq "for:$hometeam") {
                $homegoalcount++;
                $print_both->("$val,$homegoalcount,$awaygoalcount,true,");
            } elsif ($val eq "for:$awayteam") {
                $awaygoalcount++;
                $print_both->("$val,$awaygoalcount,$homegoalcount,false,");
            } else {
                $print_both->("$val,");
            }
        }
        $print_both->("\n");
    }
    # Buffered write errors only surface at close, so check it.
    close $GOALCSV or die "Can't close goalcsv.txt: $!";
}
closedir $dh;
# Translate the short club name found on a match page into the longer name
# used in the output. Names without an entry (and undef) are returned
# unchanged.
sub correctTeamName{
    my ($short_name) = @_;
    return $short_name unless defined $short_name;

    my %teamnames = (
        "Nott'm Forest" => "Nottingham Forest",
        "QPR"           => "Queens Park Rangers",
        "Southampton"   => "Southampton FC",
        "Norwich"       => "Norwich City",
        "Tottenham"     => "Tottenham Hotspur",
        "Leeds"         => "Leeds United",
        "Middlesbrough" => "Middlesbrough FC",
        "Chelsea"       => "Chelsea FC",
        "Arsenal"       => "Arsenal FC",
        "Oldham"        => "Oldham Athletic",
        "Ipswich"       => "Ipswich Town",
        "Man Utd"       => "Manchester United",
        "Man City"      => "Manchester City",
        "Sheffield Wed" => "Sheffield Wednesday",
        "Blackburn"     => "Blackburn Rovers",
        "Wimbledon"     => "AFC Wimbledon",
        "Liverpool"     => "Liverpool FC",
        "Coventry"      => "Coventry City",
    );

    # The name is the first (and only) argument; looking up the second
    # argument would never find an entry, so no name would be translated.
    return exists $teamnames{$short_name} ? $teamnames{$short_name} : $short_name;
}
# Convert an English month name ("January" .. "December") to its number
# (1 .. 12). Dies with "Unknown month name ..." for any other name.
sub monthConvert{
    my @calendar = qw(
        January February March
        April   May      June
        July    August   September
        October November December
    );

    # Pair each name with its 1-based position in the calendar.
    my %number_of;
    @number_of{@calendar} = (1 .. scalar @calendar);

    die "Unknown month name $_[0]" unless exists $number_of{$_[0]};
    return $number_of{$_[0]};
}
注意：代码未经测试，因为没有提供示例文件。不过至少它可以通过编译。