每当我运行以下Perl脚本时,我都会遇到以下错误
Use of uninitialized value $date in concatenation (.) or string at D:\sagar\toc\Online_TOC.pl line 111, <> line 1.
Use of uninitialized value $first_page in concatenation (.) or string at D:\sagar\toc\Online_TOC.pl line 111, <> line 1.
Use of uninitialized value $last_page in concatenation (.) or string at D:\sagar\toc\Online_TOC.pl line 111, <> line 1.
我在命令提示符下运行以下代码,并输入URL http://ajpheart.physiology.org/content/309/11
它生成了meta_issue_11.xml文件,但文件中的输出不正确。
#!/usr/bin/perl
use warnings;
use strict;
use feature qw{ say };
use HTML::Parser;
use WWW::Mechanize;
my ( $date, $first_page, $last_page, @toc );
# Start-tag handler for HTML::Parser (argspec 'self, tagname, attr').
#
# Inspects the tag name and its "class" attribute and, when they identify
# an element of interest, installs the text handler that will consume the
# text following that tag:
#   span, class eq 'highwire-cite-metadata-date'  -> next_text_to_date
#                                                    (only while $date is undef)
#   span, class eq 'highwire-cite-metadata-pages' -> parse_first_page, or
#                                                    parse_last_page once
#                                                    $first_page is set
#   span, class eq 'highwire-cite-metadata-doi'   -> retrieve_doi
#   div,  class containing the word
#         'issue-toc-section'                     -> next_text_to_toc
#
# Note that the span classes are compared against the WHOLE attribute
# value, so a span carrying several space-separated class names is not
# recognised.
sub get_date {
    my ( $self, $tag, $attr ) = @_;

    # Tags without a (true) class attribute are never of interest.
    my $class = $attr->{class};
    return unless $class;

    my $text_handler;
    if ( $tag eq 'span' ) {
        if ( $class eq 'highwire-cite-metadata-date' ) {
            # Only the first date on the page is wanted.
            $text_handler = \&next_text_to_date if not defined $date;
        }
        elsif ( $class eq 'highwire-cite-metadata-pages' ) {
            $text_handler
                = defined $first_page
                ? \&parse_last_page
                : \&parse_first_page;
        }
        elsif ( $class eq 'highwire-cite-metadata-doi' ) {
            $text_handler = \&retrieve_doi;
        }
    }
    elsif ( $tag eq 'div' ) {
        $text_handler = \&next_text_to_toc
            if $class =~ /\bissue-toc-section\b/;
    }

    $self->handler( text => $text_handler, 'self, text' ) if $text_handler;
}
# Text handler: stores the given text, with leading and trailing
# whitespace removed, in the file-level $date and then uninstalls the
# parser's text handler.
sub next_text_to_date {
    my ( $self, $text ) = @_;
    ( $date = $text ) =~ s/^\s+|\s+$//g;
    $self->handler( text => undef );
}
# Text handler: takes the first run of upper-case letters/digits in the
# text as the first page, stores it in the file-level $first_page and
# uninstalls the text handler. Text without such a run is ignored and the
# handler stays installed.
sub parse_first_page {
    my ( $self, $text ) = @_;

    return unless $text =~ /([A-Z0-9]+)(?:-[0-9A-Z]+)?/;

    $first_page = $1;
    $self->handler( text => undef );
}
# Text handler: extracts a page number from the text (the part after the
# hyphen when the text holds a "first-last" range), stores it in the
# file-level $last_page and uninstalls the text handler. Text that does
# not match is ignored and the handler stays installed.
sub parse_last_page {
    my ( $self, $text ) = @_;

    return unless $text =~ /(?:[A-Z0-9]+-)?([0-9A-Z]+)/;

    $last_page = $1;
    $self->handler( text => undef );
}
# Text handler: opens a new section in the file-level @toc. Each section
# is an array reference whose first element is the heading text; DOIs are
# appended to it later. The text handler is uninstalled afterwards.
sub next_text_to_toc {
    my ( $self, $heading ) = @_;

    my @section = ($heading);
    push @toc, \@section;

    $self->handler( text => undef );
}
# Text handler: appends a DOI to the most recent section of the
# file-level @toc and uninstalls the text handler.
#
# The text is trimmed first. Empty text and the "DOI:" label are skipped,
# leaving the handler installed so that the following text chunk is
# examined instead.
sub retrieve_doi {
    my ( $self, $text ) = @_;

    # Trim before comparing, so that a padded label or pure whitespace is
    # not mistaken for the DOI itself.
    $text =~ s/^\s+|\s+$//g;
    return if $text eq '' or $text eq 'DOI:';

    # A DOI may be seen before any section heading. $toc[-1] on an empty
    # array is fatal when used as an lvalue, so open an untitled section.
    push @toc, [''] unless @toc;

    push @{ $toc[-1] }, $text;
    $self->handler( text => undef );
    return;
}
# Main program: prompt for an issue URL, fetch and parse the page, then
# write meta_issue_<issue>.xml in the current directory.

print STDERR 'Enter the URL: ';
my $url = <>;
die "No URL given\n" unless defined $url;    # EOF instead of a line
chomp $url;

# The last two path segments of the URL are used as volume and issue.
my ( $volume, $issue ) = ( split m(/), $url )[ -2, -1 ];
die "Cannot determine volume and issue from URL '$url'\n"
    unless defined $volume and defined $issue;

my $p = 'HTML::Parser'->new(
    api_version => 3,
    start_h     => [ \&get_date, 'self, tagname, attr' ],
);

my $mech = 'WWW::Mechanize'->new( agent => 'Mozilla' );
$mech->get($url);
my $contents = $mech->content;
$p->parse($contents);
$p->eof;

# Build the <TOC> body. Start from an empty string so that a page without
# any section does not interpolate undef below.
my $toc = '';
for my $section (@toc) {
    my ( $heading, @dois ) = @$section;
    $toc .= "<TocSection>\n";
    $toc .= "<Heading>$heading</Heading>\n";
    $toc .= join q(), map "<DOI>$_</DOI>\n", @dois;
    $toc .= "</TocSection>\n";
}

my $out_file = "meta_issue_$issue.xml";
open my $out_fh, '>', $out_file or die "Cannot open $out_file: $!\n";

# $date, $first_page and $last_page are only set by the text handlers that
# get_date installs. If no tag was recognised they are still undef here
# and Perl warns about an uninitialized value in this heredoc.
print {$out_fh} <<"__HTML__";
<!DOCTYPE MetaIssue SYSTEM "http://schema.highwire.org/public/toc/MetaIssue.pubids.dtd">
<MetaIssue volume="$volume" issue="$issue">
<Provider>Cadmus</Provider>
<IssueDate>$date</IssueDate>
<PageRange>$first_page-$last_page</PageRange>
<TOC>$toc</TOC>
</MetaIssue>
__HTML__

# Buffered write errors only surface when the handle is closed.
close $out_fh or die "Cannot write $out_file: $!\n";
答案 0 :(得分:3)
主要问题是您正在检查相等的类字符串,而所需的类可能只是几个以空格分隔的类名之一
但是还存在许多其他问题,例如只是为了获取网页就使用WWW::Mechanize,而LWP::Simple就足够了;另外还在多个分支中重复进行'span' eq $tag的比较。
下面是一个可以正常工作的版本。我更倾向于使用XML::Writer来生成输出的XML,但这里我沿用了简单的print语句,就像在您自己的代码中一样。
注意:像 #/ 这样的注释只是为了让Stack Overflow的语法高亮器正确地为代码着色。在实际使用的代码中应当将它们删除。
#!/usr/bin/perl
use strict;
use warnings 'all';
use LWP::Simple 'get';
use HTML::Parser;
# Main program: prompt for an issue URL (falling back to a sample issue),
# fetch and parse the page, then write meta_issue_<issue>.xml in the
# current directory.

# Filled in by the text handlers while the page is parsed.
my ( $date, $first_page, $last_page, @toc );

print 'Enter the URL: ';
my $url = <>;
$url = '' unless defined $url;    # EOF instead of a line
$url =~ s/^\s+|\s+$//g;           # drops the newline (and any CR) as well

# Apply the default only after trimming: a bare "\n" is a true value, so
# testing the raw input would never fall back to the default.
$url = 'http://ajpheart.physiology.org/content/309/11' unless length $url;

# The last two path segments of the URL are used as volume and issue.
my ( $volume, $issue ) = ( split m(/), $url )[ -2, -1 ];
die "Cannot determine volume and issue from URL '$url'\n"
    unless defined $volume and defined $issue;

my $p = 'HTML::Parser'->new(
    api_version => 3,
    start_h     => [ \&get_span_div, 'self, tagname, attr' ],
);

# LWP::Simple::get returns undef when the document cannot be fetched.
my $contents = get($url);
die "Failed to fetch $url\n" unless defined $contents;

$p->parse($contents);
$p->eof;

# Report metadata the page did not provide, and write it as empty rather
# than interpolating undef into the output.
for my $field (
    [ 'issue date' => \$date ],
    [ 'first page' => \$first_page ],
    [ 'last page'  => \$last_page ],
    )
{
    my ( $label, $value_ref ) = @$field;
    next if defined $$value_ref;
    warn "No $label found in $url\n";
    $$value_ref = '';
}

my $toc = '';
for my $section (@toc) {
    my ( $heading, @dois ) = @$section;
    $toc .= "\n";
    $toc .= " <TocSection>\n";
    $toc .= " <Heading>$heading</Heading>\n";
    $toc .= " <DOI>$_</DOI>\n" for @dois;
    $toc .= " </TocSection>";
}

my $out_file = "meta_issue_$issue.xml";
open my $out_fh, '>', $out_file or die "Cannot open $out_file: $!\n";

print {$out_fh} <<"__HTML__";
<!DOCTYPE MetaIssue SYSTEM "http://schema.highwire.org/public/toc/MetaIssue.pubids.dtd">
<MetaIssue volume="$volume" issue="$issue">
<Provider>Cadmus</Provider>
<IssueDate>$date</IssueDate>
<PageRange>$first_page-$last_page</PageRange>
<TOC>$toc
</TOC>
</MetaIssue>
__HTML__

# Buffered write errors only surface when the handle is closed.
close $out_fh or die "Cannot write $out_file: $!\n";
#/
# Start-tag handler for HTML::Parser (argspec 'self, tagname, attr').
#
# Splits the tag's "class" attribute on whitespace and, depending on the
# tag name and the class names present, installs the text handler that
# will consume the text following the tag:
#   span.highwire-cite-metadata-date  -> next_text_to_date (only while
#                                        $date is still false)
#   span.highwire-cite-metadata-pages -> parse_first_page, or
#                                        parse_last_page once $first_page
#                                        is defined
#   span.highwire-cite-metadata-doi   -> retrieve_doi
#   div.issue-toc-section             -> next_text_to_toc
sub get_span_div {
    my ( $self, $tag, $attr ) = @_;

    # Set of the individual class names carried by this tag.
    my %has_class;
    if ( my $class_attr = $attr->{class} ) {
        $has_class{$_} = 1 for split ' ', $class_attr;
    }

    my $text_handler;
    if ( $tag eq 'span' ) {
        if ( $has_class{'highwire-cite-metadata-date'} ) {
            $text_handler = \&next_text_to_date unless $date;
        }
        elsif ( $has_class{'highwire-cite-metadata-pages'} ) {
            $text_handler
                = defined $first_page
                ? \&parse_last_page
                : \&parse_first_page;
        }
        elsif ( $has_class{'highwire-cite-metadata-doi'} ) {
            $text_handler = \&retrieve_doi;
        }
    }
    elsif ( $tag eq 'div' and $has_class{'issue-toc-section'} ) {
        $text_handler = \&next_text_to_toc;
    }

    $self->handler( text => $text_handler, 'self, text' ) if $text_handler;
}
# Text handler: stores the given text, stripped of leading and trailing
# whitespace, in the file-level $date and then uninstalls the parser's
# text handler.
sub next_text_to_date {
    my ( $self, $text ) = @_;

    $text =~ s/^\s+|\s+$//g;
    $date = $text;

    $self->handler( text => undef );
}
# Text handler: takes the first run of word characters in the text as the
# first page, stores it in the file-level $first_page and uninstalls the
# text handler. Text without any word character is ignored and the
# handler stays installed.
sub parse_first_page {
    my ( $self, $text ) = @_;

    if ( $text =~ /(\w+)(-\w+)?/ ) {
        $first_page = $1;
        $self->handler( text => undef );
    }
}
# Text handler: extracts the last page from the text, stores it in the
# file-level $last_page and uninstalls the text handler.
#
# The end of a "first-last" range is preferred. When the text holds no
# range, its first run of word characters is used, so that an article
# occupying a single page is handled the same way parse_first_page
# handles it. Text without any word character is ignored and the handler
# stays installed.
sub parse_last_page {
    my ( $self, $text ) = @_;

    my ($page) = $text =~ /\w+-(\w+)/;

    # Single page: no hyphenated range to take the end from.
    ($page) = $text =~ /(\w+)/ unless defined $page;
    return unless defined $page;

    $last_page = $page;
    $self->handler( text => undef );
    return;
}
# Text handler: opens a new section in the file-level @toc. Each section
# is an array reference whose first element is the heading text; DOIs are
# appended to it later. The text handler is uninstalled afterwards.
sub next_text_to_toc {
    my ( $self, $heading ) = @_;

    my @section = ($heading);
    push @toc, \@section;

    $self->handler( text => undef );
}
# Text handler: appends a DOI to the most recent section of the
# file-level @toc and uninstalls the text handler.
#
# Text without any digit is skipped, leaving the handler installed so
# that the following text chunk is examined instead.
sub retrieve_doi {
    my ( $self, $text ) = @_;

    return unless $text =~ /\d/;
    $text =~ s/^\s+|\s+$//g;

    # A DOI may be seen before any section heading. $toc[-1] on an empty
    # array is fatal when used as an lvalue, so open an untitled section.
    push @toc, [''] unless @toc;

    push @{ $toc[-1] }, $text;
    $self->handler( text => undef );
    return;
}
<!DOCTYPE MetaIssue SYSTEM "http://schema.highwire.org/public/toc/MetaIssue.pubids.dtd">
<MetaIssue volume="309" issue="11">
<Provider>Cadmus</Provider>
<IssueDate>December 1, 2015</IssueDate>
<PageRange>H1793-H1996</PageRange>
<TOC>
<TocSection>
<Heading>CALL FOR PAPERS | Cardiovascular Responses to Environmental Stress</Heading>
<DOI>10.1152/ajpheart.00199.2015</DOI>
</TocSection>
<TocSection>
<Heading>CALL FOR PAPERS | Autophagy in the Cardiovascular System</Heading>
<DOI>10.1152/ajpheart.00709.2014</DOI>
</TocSection>
<TocSection>
<Heading>CALL FOR PAPERS | Mechanisms of Diastolic Dysfunction in Cardiovascular Disease</Heading>
<DOI>10.1152/ajpheart.00608.2015</DOI>
</TocSection>
<TocSection>
<Heading>CALL FOR PAPERS | Small Vessels–Big Problems: Novel Insights into Microvascular Mechanisms of Diseases</Heading>
<DOI>10.1152/ajpheart.00463.2015</DOI>
<DOI>10.1152/ajpheart.00691.2015</DOI>
<DOI>10.1152/ajpheart.00568.2015</DOI>
<DOI>10.1152/ajpheart.00653.2015</DOI>
</TocSection>
<TocSection>
<Heading>CALL FOR PAPERS | Exercise Training in Cardiovascular Disease: Mechanisms and Outcomes</Heading>
<DOI>10.1152/ajpheart.00341.2015</DOI>
</TocSection>
<TocSection>
<Heading>CALL FOR PAPERS | Cardiac Regeneration and Repair: Mechanisms and Therapy</Heading>
<DOI>10.1152/ajpheart.00594.2015</DOI>
</TocSection>
<TocSection>
<Heading>Vascular Biology and Microcirculation</Heading>
<DOI>10.1152/ajpheart.00289.2015</DOI>
<DOI>10.1152/ajpheart.00308.2015</DOI>
<DOI>10.1152/ajpheart.00179.2015</DOI>
</TocSection>
<TocSection>
<Heading>Muscle Mechanics and Ventricular Function</Heading>
<DOI>10.1152/ajpheart.00284.2015</DOI>
<DOI>10.1152/ajpheart.00327.2015</DOI>
</TocSection>
<TocSection>
<Heading>Signaling and Stress Response</Heading>
<DOI>10.1152/ajpheart.00050.2015</DOI>
</TocSection>
<TocSection>
<Heading>Cardiac Excitation and Contraction</Heading>
<DOI>10.1152/ajpheart.00055.2015</DOI>
</TocSection>
<TocSection>
<Heading>Integrative Cardiovascular Physiology and Pathophysiology</Heading>
<DOI>10.1152/ajpheart.00316.2015</DOI>
<DOI>10.1152/ajpheart.00721.2014</DOI>
</TocSection>
<TocSection>
<Heading>Corrigendum</Heading>
<DOI>10.1152/ajpheart.H-zh4-1780-corr.2015</DOI>
</TocSection>
</TOC>
</MetaIssue>