检索在线数据并生成xml文件

时间:2015-12-12 04:41:15

标签: perl perl-module

每当我运行以下Perl脚本时,我都会遇到以下错误

Use of uninitialized value $date in concatenation (.) or string at D:\sagar\toc\Online_TOC.pl line 111, <> line 1.
Use of uninitialized value $first_page in concatenation (.) or string at D:\sagar\toc\Online_TOC.pl line 111, <> line 1.
Use of uninitialized value $last_page in concatenation (.) or string at D:\sagar\toc\Online_TOC.pl line 111, <> line 1. 

这些错误是在命令提示符下运行以下代码,并输入如下URL时出现的:
http://ajpheart.physiology.org/content/309/11

它会生成meta_issue_11.xml文件,但其中的输出不正确。

#!/usr/bin/perl
use warnings;
use strict;
use feature qw{ say };

use HTML::Parser;
use WWW::Mechanize;

# Scraped issue metadata, filled in by the HTML::Parser callbacks.
# $date, $first_page and $last_page stay undef until their text is seen;
# each @toc entry is an array ref of [ heading, doi, doi, ... ].
my ( $date, $first_page, $last_page, @toc );

# Start-tag callback for HTML::Parser (argspec 'self, tagname, attr').
#
# Looks at the class attribute of <span> and <div> elements and, when it
# identifies a marker of interest, installs the text handler that will
# consume the text following the tag. For spans the class attribute must
# equal the marker exactly; for divs it only has to contain
# "issue-toc-section" as a whole word.
sub get_date {
    my ( $self, $tag, $attr ) = @_;

    my $is_span = $tag eq 'span';
    return unless $is_span or $tag eq 'div';

    my $class = $attr->{class};
    return unless $class;

    my $callback;
    if ( not $is_span ) {
        $callback = \&next_text_to_toc
            if $class =~ /\bissue-toc-section\b/;
    }
    elsif ( $class eq 'highwire-cite-metadata-date' ) {
        # Only the first date is recorded; later date spans are ignored.
        $callback = \&next_text_to_date unless defined $date;
    }
    elsif ( $class eq 'highwire-cite-metadata-pages' ) {
        # The first pages span yields the first page; every later pages
        # span overwrites the last page.
        $callback
            = defined $first_page ? \&parse_last_page : \&parse_first_page;
    }
    elsif ( $class eq 'highwire-cite-metadata-doi' ) {
        $callback = \&retrieve_doi;
    }

    $self->handler( text => $callback, 'self, text' ) if $callback;
}

# Text callback: records the text, stripped of leading and trailing
# whitespace, as the issue date, then uninstalls the text handler.
sub next_text_to_date {
    my $self = shift;

    ( $date = shift ) =~ s/^\s+|\s+$//g;
    $self->handler( text => undef );
}

# Text callback: takes the first run of uppercase letters/digits in the
# text as the first page of the issue and uninstalls the text handler.
# Text without such a run is skipped and the handler stays installed.
sub parse_first_page {
    my ( $self, $text ) = @_;

    my ($start) = $text =~ /([A-Z0-9]+)(?:-[0-9A-Z]+)?/
        or return;

    $first_page = $start;
    $self->handler( text => undef );
}

# Text callback: records the end of a page range (or a lone page) as the
# last page of the issue and uninstalls the text handler. Text with no
# uppercase letters/digits is skipped and the handler stays installed.
sub parse_last_page {
    my ( $self, $text ) = @_;

    my ($end) = $text =~ /(?:[A-Z0-9]+-)?([0-9A-Z]+)/
        or return;

    $last_page = $end;
    $self->handler( text => undef );
}

# Text callback: opens a new TOC section whose first element is the given
# text (the section heading), then uninstalls the text handler.
sub next_text_to_toc {
    my ( $self, $heading ) = @_;

    my @section = ($heading);
    push @toc, \@section;
    $self->handler( text => undef );
}

# Text callback: appends the whitespace-trimmed text to the most recent TOC
# section as a DOI and uninstalls the text handler. Text that is exactly
# 'DOI:' is skipped and the handler stays installed (presumably this is the
# label that precedes the value).
sub retrieve_doi {
    my ( $self, $text ) = @_;

    return if $text eq 'DOI:';

    $text =~ s/^\s+|\s+$//g;

    my $current_section = $toc[-1];
    push @$current_section, $text;
    $self->handler( text => undef );
}

# Main program: read an issue URL, scrape the page, write the MetaIssue XML.

# Prompt on STDERR and read the URL from standard input (or from a file
# named on the command line, since <> is used).
print STDERR 'Enter the URL: ';
chomp( my $url = <> );

# The last two path components of the URL are the volume and the issue.
my ( $volume, $issue ) = ( split m(/), $url )[ -2, -1 ];

# get_date is invoked for every start tag and installs text handlers that
# fill $date, $first_page, $last_page and @toc.
my $p = 'HTML::Parser'->new(
    api_version => 3,
    start_h     => [ \&get_date, 'self, tagname, attr' ],
);

my $mech = 'WWW::Mechanize'->new( agent => 'Mozilla' );
$mech->get( $url );
my $contents = $mech->content;
$p->parse( $contents );
$p->eof;

# Start from the empty string so that a page with no TOC sections does not
# interpolate an undefined value into the output.
my $toc = '';
for my $section ( @toc ) {
    # The first element of a section is its heading, the rest are DOIs.
    $toc .= "<TocSection>\n";
    $toc .= "<Heading>" . shift( @$section ) . "</Heading>\n";
    $toc .= join q(), map "<DOI>$_</DOI>\n", @$section;
    $toc .= "</TocSection>\n";
}

# Lexical handle, three-argument open, and an explicit failure check instead
# of silently printing to an unopened handle.
my $out_name = "meta_issue_$issue.xml";
open( my $out_fh, '>', $out_name )
    or die "Cannot open $out_name for writing: $!";

# NOTE: $date, $first_page and $last_page are interpolated as-is; any of
# them that the callbacks never set is undef and triggers an
# "uninitialized value" warning here.
print {$out_fh} <<"__HTML__";
<!DOCTYPE MetaIssue SYSTEM "http://schema.highwire.org/public/toc/MetaIssue.pubids.dtd">
<MetaIssue volume="$volume" issue="$issue">
<Provider>Cadmus</Provider>
<IssueDate>$date</IssueDate>
<PageRange>$first_page-$last_page</PageRange>
<TOC>$toc</TOC>
</MetaIssue>
__HTML__

# Buffered write errors only surface when the handle is closed.
close($out_fh) or die "Cannot close $out_name: $!";

1 个答案:

答案 0 :(得分:3)

主要问题是您用完全相等的方式来比较class字符串,而所需的类名可能只是class属性中以空格分隔的多个类名之一

但是还存在许多其他问题,例如:仅仅为了获取网页就使用WWW::Mechanize,而LWP::Simple就足够了;以及对 'span' eq $tag 重复检查了三次

这是一个可以正常工作的版本。我更希望使用XML::Writer来生成输出的XML,但我沿用了简单的print语句,就像您自己的代码那样

注意:形如 #/ 的注释只是为了让Stack Overflow的语法高亮器正确地为代码着色。在实际使用的代码中应当将它们删除

#!/usr/bin/perl
use strict;
use warnings 'all';

use LWP::Simple 'get';
use HTML::Parser;

# Scraped issue metadata, filled in by the parser callbacks defined below.
# Each @toc entry is an array ref of [ heading, doi, doi, ... ].
my ( $date, $first_page, $last_page, @toc );

print 'Enter the URL: ';
my $url = <>;
$url = '' unless defined $url;    # end of input before anything was read
chomp $url;

# Fall back to the sample issue when nothing was entered. The check has to
# come after chomp: an empty line arrives as "\n", which is a true value.
$url = 'http://ajpheart.physiology.org/content/309/11' unless length $url;

# The last two path components of the URL are the volume and the issue.
my ( $volume, $issue ) = ( split m(/), $url )[ -2, -1 ];  #/

# get_span_div is invoked for every start tag and installs the text
# handlers that fill the metadata variables above.
my $p = 'HTML::Parser'->new(
    api_version => 3,
    start_h     => [ \&get_span_div, 'self, tagname, attr' ],
);

# LWP::Simple::get returns undef when the page cannot be fetched.
my $contents = get($url);
die "Unable to fetch $url\n" unless defined $contents;
$p->parse( $contents );
$p->eof;

# Report metadata that was not found instead of letting undef be
# interpolated into the output; the element is then written empty.
for my $field (
    [ 'issue date' => \$date ],
    [ 'first page' => \$first_page ],
    [ 'last page'  => \$last_page ],
    )
{
    my ( $label, $value_ref ) = @$field;
    next if defined $$value_ref;
    warn "No $label found in $url\n";
    $$value_ref = '';
}

my $toc = '';
for my $section ( @toc ) {
    # The first element of a section is its heading, the rest are DOIs.
    $toc .= "\n";
    $toc .= "    <TocSection>\n";
    $toc .= "      <Heading>" . shift( @$section ) . "</Heading>\n";
    $toc .= "      <DOI>$_</DOI>\n" for @$section;
    $toc .= "    </TocSection>";
}

my $out_file = "meta_issue_$issue.xml";
open my $out_fh, '>', $out_file
    or die "Cannot open $out_file for writing: $!";

print  { $out_fh } <<"__HTML__";
<!DOCTYPE MetaIssue SYSTEM "http://schema.highwire.org/public/toc/MetaIssue.pubids.dtd">
<MetaIssue volume="$volume" issue="$issue">
  <Provider>Cadmus</Provider>
  <IssueDate>$date</IssueDate>
  <PageRange>$first_page-$last_page</PageRange>
  <TOC>$toc
  </TOC>
</MetaIssue>
__HTML__
#/

# Buffered write errors only surface when the handle is closed.
close $out_fh or die "Cannot close $out_file: $!";

# Start-tag callback for HTML::Parser (argspec 'self, tagname, attr').
#
# Splits the element's class attribute into individual class names and,
# for <span> and <div> elements carrying one of the marker classes,
# installs the text handler that will consume the text following the tag.
sub get_span_div {
    my ( $self, $tag, $attr ) = @_;

    # Set of the whitespace-separated class names on this element.
    my %has_class;
    if ( my $names = $attr->{class} ) {
        $has_class{$_} = 1 for split ' ', $names;
    }

    my $callback;
    if ( $tag eq 'div' ) {
        $callback = \&next_text_to_toc if $has_class{'issue-toc-section'};
    }
    elsif ( $tag ne 'span' ) {
        return;
    }
    elsif ( $has_class{'highwire-cite-metadata-date'} ) {
        # Once a non-empty date has been stored, later date spans are ignored.
        $callback = \&next_text_to_date unless $date;
    }
    elsif ( $has_class{'highwire-cite-metadata-pages'} ) {
        # The first pages span yields the first page; later ones update the
        # last page.
        $callback
            = defined $first_page ? \&parse_last_page : \&parse_first_page;
    }
    elsif ( $has_class{'highwire-cite-metadata-doi'} ) {
        $callback = \&retrieve_doi;
    }

    return unless $callback;
    $self->handler( text => $callback, 'self, text' );
}

# Text callback: records the text, stripped of leading and trailing
# whitespace, as the issue date, then uninstalls the text handler.
sub next_text_to_date {
    my ( $self, $text ) = @_;

    $text =~ s/^\s+|\s+$//g;  #/
    $date = $text;
    $self->handler( text => undef );
}

# Text callback: takes the first run of word characters in the text as the
# first page of the issue and uninstalls the text handler. Text containing
# no word characters is skipped and the handler stays installed.
sub parse_first_page {
    my ( $self, $text ) = @_;

    if ( $text =~ /(\w+)(-\w+)?/ ) {  #/
        $first_page = $1;
        $self->handler( text => undef );
    }
}

# Text callback: records the last page of the issue and uninstalls the text
# handler.
#
# For a range such as "H1793-H1802" the part after the hyphen is stored.
# A lone page such as "H1996" is stored as it is: requiring the hyphen would
# skip single-page entries and leave this handler installed for whatever
# text comes next. Text containing no word characters is skipped and the
# handler stays installed.
sub parse_last_page {
    my ( $self, $text ) = @_;

    # The optional prefix greedily consumes "start-" when a range is present.
    return unless $text =~ /(?:\w+-)?(\w+)/;  #/

    $last_page = $1;
    $self->handler( text => undef );
}

# Text callback: opens a new TOC section whose first element is the given
# text (the section heading), then uninstalls the text handler.
sub next_text_to_toc {
    my ( $self, $heading ) = @_;

    my @section = ($heading);
    push @toc, \@section;
    $self->handler( text => undef );
}

# Text callback: appends the whitespace-trimmed text to the most recent TOC
# section as a DOI and uninstalls the text handler. Text containing no digit
# is skipped and the handler stays installed.
#
# If no section has been opened yet, one with an empty heading is created so
# that the DOI is kept; pushing through $toc[-1] on an empty @toc would
# otherwise be a fatal error.
sub retrieve_doi {
    my ( $self, $text ) = @_;

    return unless $text =~ /\d+/;  #/

    $text =~ s/^\s+|\s+$//g;
    push @toc, [ '' ] unless @toc;
    push @{ $toc[-1] }, $text;
    $self->handler( text => undef );
}

输出

<!DOCTYPE MetaIssue SYSTEM "http://schema.highwire.org/public/toc/MetaIssue.pubids.dtd">
<MetaIssue volume="309" issue="11">
  <Provider>Cadmus</Provider>
  <IssueDate>December 1, 2015</IssueDate>
  <PageRange>H1793-H1996</PageRange>
  <TOC>
    <TocSection>
      <Heading>CALL FOR PAPERS | Cardiovascular Responses to Environmental Stress</Heading>
      <DOI>10.1152/ajpheart.00199.2015</DOI>
    </TocSection>
    <TocSection>
      <Heading>CALL FOR PAPERS | Autophagy in the Cardiovascular System</Heading>
      <DOI>10.1152/ajpheart.00709.2014</DOI>
    </TocSection>
    <TocSection>
      <Heading>CALL FOR PAPERS | Mechanisms of Diastolic Dysfunction in Cardiovascular Disease</Heading>
      <DOI>10.1152/ajpheart.00608.2015</DOI>
    </TocSection>
    <TocSection>
      <Heading>CALL FOR PAPERS | Small Vessels&ndash;Big Problems: Novel Insights into Microvascular Mechanisms of Diseases</Heading>
      <DOI>10.1152/ajpheart.00463.2015</DOI>
      <DOI>10.1152/ajpheart.00691.2015</DOI>
      <DOI>10.1152/ajpheart.00568.2015</DOI>
      <DOI>10.1152/ajpheart.00653.2015</DOI>
    </TocSection>
    <TocSection>
      <Heading>CALL FOR PAPERS | Exercise Training in Cardiovascular Disease: Mechanisms and Outcomes</Heading>
      <DOI>10.1152/ajpheart.00341.2015</DOI>
    </TocSection>
    <TocSection>
      <Heading>CALL FOR PAPERS | Cardiac Regeneration and Repair: Mechanisms and Therapy</Heading>
      <DOI>10.1152/ajpheart.00594.2015</DOI>
    </TocSection>
    <TocSection>
      <Heading>Vascular Biology and Microcirculation</Heading>
      <DOI>10.1152/ajpheart.00289.2015</DOI>
      <DOI>10.1152/ajpheart.00308.2015</DOI>
      <DOI>10.1152/ajpheart.00179.2015</DOI>
    </TocSection>
    <TocSection>
      <Heading>Muscle Mechanics and Ventricular Function</Heading>
      <DOI>10.1152/ajpheart.00284.2015</DOI>
      <DOI>10.1152/ajpheart.00327.2015</DOI>
    </TocSection>
    <TocSection>
      <Heading>Signaling and Stress Response</Heading>
      <DOI>10.1152/ajpheart.00050.2015</DOI>
    </TocSection>
    <TocSection>
      <Heading>Cardiac Excitation and Contraction</Heading>
      <DOI>10.1152/ajpheart.00055.2015</DOI>
    </TocSection>
    <TocSection>
      <Heading>Integrative Cardiovascular Physiology and Pathophysiology</Heading>
      <DOI>10.1152/ajpheart.00316.2015</DOI>
      <DOI>10.1152/ajpheart.00721.2014</DOI>
    </TocSection>
    <TocSection>
      <Heading>Corrigendum</Heading>
      <DOI>10.1152/ajpheart.H-zh4-1780-corr.2015</DOI>
    </TocSection>
  </TOC>
</MetaIssue>