
时间:2015-12-12 04:41:15

标签: perl perl-module


Use of uninitialized value $date in concatenation (.) or string at D:\sagar\toc\Online_TOC.pl line 111, <> line 1.
Use of uninitialized value $first_page in concatenation (.) or string at D:\sagar\toc\Online_TOC.pl line 111, <> line 1.
Use of uninitialized value $last_page in concatenation (.) or string at D:\sagar\toc\Online_TOC.pl line 111, <> line 1. 




use warnings;
use strict;
use feature qw{ say };

use HTML::Parser;
use WWW::Mechanize;

my ( $date, $first_page, $last_page, @toc );

# Start-tag handler for HTML::Parser (registered with the argspec
# 'self, tagname, attr'). When the tag marks one of the fields of interest it
# installs a text handler that captures the text that follows.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $tag  - name of the start tag
#   $attr - hash ref of the tag's attributes
#
# The class attribute is split on whitespace and each class name is tested
# individually, so an element that carries several classes is still
# recognised; comparing the whole attribute with 'eq' would miss it and leave
# $date, $first_page and $last_page undefined.
sub get_date {
    my ( $self, $tag, $attr ) = @_;

    my $class = $attr->{class};
    return unless $class;

    my %has_class = map { $_ => 1 } split ' ', $class;

    if ( 'span' eq $tag
        and $has_class{'highwire-cite-metadata-date'}
        and not defined $date )
    {
        # Only the first date encountered is recorded.
        $self->handler( text => \&next_text_to_date, 'self, text' );
    }
    elsif ( 'span' eq $tag
        and $has_class{'highwire-cite-metadata-pages'} )
    {
        # The first pages span supplies the first page; every later one
        # overwrites the last page.
        if ( not defined $first_page ) {
            $self->handler( text => \&parse_first_page, 'self, text' );
        }
        else {
            $self->handler( text => \&parse_last_page, 'self, text' );
        }
    }
    elsif ( 'span' eq $tag
        and $has_class{'highwire-cite-metadata-doi'} )
    {
        $self->handler( text => \&retrieve_doi, 'self, text' );
    }
    elsif ( 'div' eq $tag
        and $class =~ /\bissue-toc-section\b/ )
    {
        $self->handler( text => \&next_text_to_toc, 'self, text' );
    }

    return;
}

# Text handler installed by get_date: stores the received text, with leading
# and trailing whitespace removed, in $date and then clears the text handler.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub next_text_to_date {
    my ( $self, $text ) = @_;

    $text =~ s/^\s+|\s+$//g;
    $date = $text;
    $self->handler( text => undef );

    return;
}

# Text handler installed by get_date: takes the first run of capital letters
# and digits in the text as $first_page (an optional "-<page>" suffix is
# matched but not kept) and then clears the text handler. Text without such
# a run leaves the handler installed.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub parse_first_page {
    my ( $self, $text ) = @_;

    if ( $text =~ /([A-Z0-9]+)(?:-[0-9A-Z]+)?/ ) {
        $first_page = $1;
        $self->handler( text => undef );
    }

    return;
}

# Text handler installed by get_date: stores in $last_page the part after
# the hyphen of a "<first>-<last>" range, or the lone page when there is no
# hyphen, and then clears the text handler. Text with no capital letters or
# digits leaves the handler installed.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub parse_last_page {
    my ( $self, $text ) = @_;

    if ( $text =~ /(?:[A-Z0-9]+-)?([0-9A-Z]+)/ ) {
        $last_page = $1;
        $self->handler( text => undef );
    }

    return;
}

# Text handler installed by get_date: opens a new section in @toc whose
# first element is the received text (later used as the section heading),
# then clears the text handler.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub next_text_to_toc {
    my ( $self, $text ) = @_;

    push @toc, [$text];
    $self->handler( text => undef );

    return;
}

# Text handler installed by get_date: appends the DOI text to the most
# recent section in @toc and then clears the text handler.
#
# The text is trimmed before it is examined, so a 'DOI:' label surrounded by
# whitespace, or a whitespace-only text chunk, is skipped (the handler stays
# installed and waits for the next text).
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub retrieve_doi {
    my ( $self, $text ) = @_;

    $text =~ s/^\s+|\s+$//g;
    return if $text eq '' or $text eq 'DOI:';

    # $toc[-1] on an empty array is a fatal error; if a DOI shows up before
    # any section heading, file it under a section with an empty heading.
    push @toc, [''] unless @toc;

    push @{ $toc[-1] }, $text;
    $self->handler( text => undef );

    return;
}

# Prompt on STDERR and read the issue URL from the input.
print STDERR 'Enter the URL: ';
my $url = <>;
die "No URL given\n" unless defined $url;    # input ended before a line was read
chomp $url;

# The last two '/'-separated parts of the URL are used as volume and issue.
my ( $volume, $issue ) = ( split m(/), $url )[ -2, -1 ];

# Parser that calls get_date for every start tag.
my $p = 'HTML::Parser'->new(
    api_version => 3,
    start_h     => [ \&get_date, 'self, tagname, attr' ],
);

my $mech = 'WWW::Mechanize'->new( agent => 'Mozilla' );
$mech->get( $url );
my $contents = $mech->content;
$p->parse( $contents );
$p->eof;    # signal end of document so any buffered text is flushed to the handlers

# Render the collected sections as XML. Each entry of @toc is an array ref
# whose first element is the heading and whose remaining elements are DOIs.
# Starting from '' keeps $toc defined even when no section was found.
my $toc = '';
for my $section ( @toc ) {
    $toc .= "<TocSection>\n";
    $toc .= "<Heading>" . shift( @$section ) . "</Heading>\n";
    $toc .= join q(), map "<DOI>$_</DOI>\n", @$section;
    $toc .= "</TocSection>\n";
}

# Open the per-issue XML file for writing. The three-argument form keeps the
# mode separate from the file name, and the script stops with the OS error if
# the file cannot be created instead of printing to an unopened handle.
open my $out_fh, '>', "meta_issue_$issue.xml"
    or die "Cannot open meta_issue_$issue.xml for writing: $!";

print {$out_fh} <<"__HTML__";
<!DOCTYPE MetaIssue SYSTEM "http://schema.highwire.org/public/toc/MetaIssue.pubids.dtd">
<MetaIssue volume="$volume" issue="$issue">

1 个答案:

答案 0 :(得分:3)


但是还存在许多其他问题,例如在 LWP::Simple 就能完成任务时,却仅仅为了获取网页而使用 WWW::Mechanize。并为'span' eq $tag



注意:像 #/ 这样的注释只是为了让 Stack Overflow 的语法高亮器正确地为代码着色。在实际代码中应当将它们删除。

use strict;
use warnings 'all';

use LWP::Simple 'get';
use HTML::Parser;

my ( $date, $first_page, $last_page, @toc );

# Prompt for the issue URL. The line ending is removed before the default is
# applied, so that simply pressing Enter (which yields "\n", a true value)
# falls back to the default URL as well as end-of-input does.
print 'Enter the URL: ';
my $url = <>;
chomp $url if defined $url;
$url ||= 'http://ajpheart.physiology.org/content/309/11';

# The last two '/'-separated parts of the URL are used as volume and issue.
my ( $volume, $issue ) = ( split m(/), $url )[ -2, -1 ];  #/

# Parser that calls get_span_div for every start tag.
my $p = 'HTML::Parser'->new(
    api_version => 3,
    start_h     => [ \&get_span_div, 'self, tagname, attr' ],
);

# LWP::Simple's get returns undef when the fetch fails.
my $contents = get($url);
die "Unable to fetch $url\n" unless defined $contents;

$p->parse( $contents );
$p->eof;    # signal end of document so any buffered text is flushed to the handlers

# Render the collected sections as indented XML. Each entry of @toc is an
# array ref whose first element is the heading and whose remaining elements
# are DOIs. Every section starts with a newline and ends without one.
my $toc = '';
for my $section ( @toc ) {
    $toc .= "\n";
    $toc .= "    <TocSection>\n";
    $toc .= "      <Heading>" . shift( @$section ) . "</Heading>\n";
    $toc .= "      <DOI>$_</DOI>\n" for @$section;
    $toc .= "    </TocSection>";
}

# Open the per-issue XML file for writing; stop with the OS error text if it
# cannot be opened.
open my $out_fh, '>', "meta_issue_$issue.xml" or die $!;

# Write the MetaIssue document from an interpolating heredoc.
# NOTE(review): the heredoc body and its __HTML__ terminator appear to be cut
# off in this copy of the script - confirm against the original.
print  { $out_fh } <<"__HTML__";
<!DOCTYPE MetaIssue SYSTEM "http://schema.highwire.org/public/toc/MetaIssue.pubids.dtd">
<MetaIssue volume="$volume" issue="$issue">

# Start-tag handler for HTML::Parser (registered with the argspec
# 'self, tagname, attr'). For span and div tags carrying one of the classes
# of interest it installs a text handler that captures the text that follows.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $tag  - name of the start tag
#   $attr - hash ref of the tag's attributes
sub get_span_div {
    my ( $self, $tag, $attr ) = @_;

    # The class attribute may hold several whitespace-separated names;
    # turn it into a lookup set.
    my $class = $attr->{class};
    my %class;
    %class = map { $_ => 1 } split ' ', $class if $class;

    if ( $tag eq 'span' ) {

        if ( $class{'highwire-cite-metadata-date'} ) {

            # Only the first date encountered is recorded.
            $self->handler( text => \&next_text_to_date, 'self, text' ) unless $date;
        }
        elsif ( $class{'highwire-cite-metadata-pages'} ) {

            # The first pages span supplies the first page; every later one
            # overwrites the last page.
            if ( not defined $first_page ) {
                $self->handler( text => \&parse_first_page, 'self, text' );
            }
            else {
                $self->handler( text => \&parse_last_page, 'self, text' );
            }
        }
        elsif ( $class{'highwire-cite-metadata-doi'} ) {

            $self->handler( text => \&retrieve_doi, 'self, text' );
        }
    }
    elsif ( $tag eq 'div' ) {

        if ( $class{'issue-toc-section'} ) {
            $self->handler( text => \&next_text_to_toc, 'self, text' );
        }
    }

    return;
}

# Text handler installed by get_span_div: stores the received text, with
# leading and trailing whitespace removed, in $date and then clears the text
# handler.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub next_text_to_date {
    my ( $self, $text ) = @_;

    ($date = $text) =~ s/^\s+|\s+$//g;  #/
    $self->handler( text => undef );

    return;
}

# Text handler installed by get_span_div: takes the first run of word
# characters in the text as $first_page (an optional "-<page>" suffix is
# matched but not kept) and then clears the text handler. Text without any
# word character leaves the handler installed.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub parse_first_page {
    my ( $self, $text ) = @_;

    return unless $text =~ /(\w+)(?:-\w+)?/;  #/

    $first_page = $1;
    $self->handler( text => undef );

    return;
}

# Text handler installed by get_span_div: stores in $last_page the part
# after the hyphen of a "<first>-<last>" range, or the lone page when the
# text holds no hyphenated range, and then clears the text handler.
#
# The "<first>-" prefix is optional: if a hyphen were required, a
# single-page entry would never match and the handler would stay installed,
# picking up whatever later text happened to contain a hyphen.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub parse_last_page {
    my ( $self, $text ) = @_;

    return unless $text =~ /(?:\w+-)?(\w+)/;  #/

    $last_page = $1;
    $self->handler( text => undef );

    return;
}

# Text handler installed by get_span_div: opens a new section in @toc whose
# first element is the received text (later used as the section heading),
# then clears the text handler.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub next_text_to_toc {
    my ( $self, $text ) = @_;

    push @toc, [ $text ];
    $self->handler( text => undef );

    return;
}

# Text handler installed by get_span_div: appends the DOI text, trimmed of
# surrounding whitespace, to the most recent section in @toc and then clears
# the text handler. Text that contains no digit is skipped and the handler
# stays installed.
#
# Parameters:
#   $self - the parser object that invoked the handler
#   $text - the text passed by the parser
sub retrieve_doi {
    my ( $self, $text ) = @_;

    return unless $text =~ /\d+/;  #/

    $text =~ s/^\s+|\s+$//g;

    # $toc[-1] on an empty array is a fatal error; if a DOI shows up before
    # any section heading, file it under a section with an empty heading.
    push @toc, [ '' ] unless @toc;

    push @{ $toc[-1] }, $text;
    $self->handler( text => undef );

    return;
}


<!DOCTYPE MetaIssue SYSTEM "http://schema.highwire.org/public/toc/MetaIssue.pubids.dtd">
<MetaIssue volume="309" issue="11">
  <IssueDate>December 1, 2015</IssueDate>
      <Heading>CALL FOR PAPERS | Cardiovascular Responses to Environmental Stress</Heading>
      <Heading>CALL FOR PAPERS | Autophagy in the Cardiovascular System</Heading>
      <Heading>CALL FOR PAPERS | Mechanisms of Diastolic Dysfunction in Cardiovascular Disease</Heading>
      <Heading>CALL FOR PAPERS | Small Vessels&ndash;Big Problems: Novel Insights into Microvascular Mechanisms of Diseases</Heading>
      <Heading>CALL FOR PAPERS | Exercise Training in Cardiovascular Disease: Mechanisms and Outcomes</Heading>
      <Heading>CALL FOR PAPERS | Cardiac Regeneration and Repair: Mechanisms and Therapy</Heading>
      <Heading>Vascular Biology and Microcirculation</Heading>
      <Heading>Muscle Mechanics and Ventricular Function</Heading>
      <Heading>Signaling and Stress Response</Heading>
      <Heading>Cardiac Excitation and Contraction</Heading>
      <Heading>Integrative Cardiovascular Physiology and Pathophysiology</Heading>