Question

您好，我希望处理一个网址数组。如果某个网址出现问题（网址无法加载，或 XPath 查找失败），必须把错误记录到 errorfile.html 中，然后继续处理其余网址。目前我得到的错误是：Can't call method "isa" on an undefined value（无法在未定义的值上调用方法 "isa"）。

  use LWP::Simple;
use File::Compare;
use HTML::TreeBuilder::XPath;
use LWP::UserAgent;



# Reads one URL per line from input.txt. For each URL it extracts a domain
# fragment, re-runs mainsub.pl and calls domain_check(), fetches the page, and
# on success writes the XPath match to "<competitor>.html".
# NOTE(review): $url and $domain are package globals (no `my`, no `use strict`);
# $competitor and $xpath are never assigned in this block -- presumably
# domain_check() in mainsub.pl sets them from $domain; confirm against that file.
{
# NOTE(review): the result of open() is not checked, and FILE is never closed
# in this block.
open(FILE, "C:/Users/jeyakuma/Desktop/shipping project/input/input.txt");  

{

while(<FILE>)
    {                   
   chomp;
   $url=$_;
   # This foreach iterates over a single scalar, so its body runs exactly once.
   # In the pattern the dots are unescaped (they match any character), and the
   # spaces inside the bracketed class are literal even under /x.
   foreach ($url)
    {
    ($domain) = $url =~ m|www.([A-Z a-z 0-9]+.{3}).|x;
    }


# mainsub.pl is executed again on every loop iteration; the return value of
# `do` is not checked, so a missing or broken file goes unnoticed here.
do 'C:/Users/jeyakuma/Desktop/perl/mainsub.pl';
&domain_check();



        # First fetch: only used to decide whether the URL is reachable.
        my $ua = LWP::UserAgent->new( agent => "Mozilla/5.0" );
        my $req = HTTP::Request->new( GET => "$url" );
        my $res = $ua->request($req);
        if ( $res->is_success ) 


        {

                print "working on $competitor\n";

                # NOTE(review): no filehandle is passed; the layer string sits
                # in the filehandle position, so this likely does not affect
                # STDOUT or the HTML handle -- confirm the intent.
                binmode ":utf8";
                # Second fetch of the same URL; the body already held in $res
                # is not reused.
                my $xp = HTML::TreeBuilder::XPath->new_from_url($url);
                print "Extracting the $competitor xpath\n";
                # NOTE(review): presumably the reported 'Can't call method
                # "isa" on an undefined value' error originates in this call
                # when $xpath is undefined or empty -- confirm what
                # domain_check() sets for this URL.
                # findnodes_as_string returns one string, which is stored here
                # in an array; confirm whether findnodes_as_strings was meant.
                my @node = $xp->findnodes_as_string("$xpath") or print "couldn't find the node\n";

                # NOTE(review): open() is unchecked, and $competitor goes
                # straight into the output path.
                open HTML, '>:encoding(cp1252)',"C:/Users/jeyakuma/Desktop/die/$competitor.html";

                # <@node> is a glob() over the interpolated array contents, not
                # a plain iteration of @node. Each pass prints the whole array,
                # and the handle is closed inside the loop, so any pass after
                # the first prints to a closed handle.
                foreach(<@node>)
                {
                print HTML @node;
                close HTML ;
                }

        }
        else{  
                # Failure is only reported on STDOUT; nothing is written to an
                # error file here.
                print "In valid url";

        }
}


}
}

Answer 1

I wish to process array of urls

那就把脚本改成对该数组进行循环，例如下面这样：

# Process every URL in @URLS; a failure on one URL is written to $error_log
# (an already-open write handle) and the loop moves on to the next URL.
foreach my $url (@URLS){
    #work on $url here
    my @node;
    # new_from_url dies when the page cannot be fetched or is not HTML, and a
    # bad XPath expression dies as well, so trap both to keep the loop running.
    my $ok = eval {
        my $xp = HTML::TreeBuilder::XPath->new_from_url($url);
        @node  = $xp->findnodes_as_strings('//div[@class="mainbox-body"]');
        1;
    };
    #don't use die, instead record error message in file.
    if (!$ok) {
        # Copy $@ immediately: it is a global that any later eval overwrites.
        my $err = $@ || "unknown error\n";
        $err .= "\n" unless $err =~ /\n\z/;
        print {$error_log} "$url: $err";
        next;
    }
    print {$error_log} "$url: node doesn't exist\n" unless @node;
    #do other tasks for url
}

编辑：请使用下面的代码，它在我这里运行正常。你脚本中的 $xpath 是什么？正是这一部分导致了你在评论中提到的 isa 错误。

#!/usr/bin/perl
# Fetch each URL, run an XPath query against the returned HTML, and keep going
# when a URL fails: every failure (fetch error, parse/XPath error, no matching
# node) is appended to an error log instead of stopping the run.
use strict;
use warnings;
use LWP::Simple;
use HTML::TreeBuilder::XPath;
use LWP::UserAgent;

#You can read URLS from file and create array, I'm doing directly for simplicity
my @urls       = ("http://www.google.com", "http://www.yahoo.com");
my $xpath      = '//div[@class="mainbox-body"]';
my $error_file = 'errorfile.html';

open my $error_log, '>>', $error_file
    or die "Cannot open $error_file for appending: $!";

# One user agent serves the whole run; there is no need to rebuild it per URL.
my $ua = LWP::UserAgent->new( agent => "Mozilla/5.0" );

URL:
foreach my $url (@urls) {
    print "working on $url\n";

    my $res = $ua->get($url);
    if ( !$res->is_success ) {
        print "In else block\n";
        print {$error_log} "$url: fetch failed: ", $res->status_line, "\n";
        next URL;
    }
    print "In if block, success\n";

    # Reuse the body that was just downloaded instead of fetching the URL a
    # second time with new_from_url. Fall back to the raw bytes if the
    # response could not be decoded.
    my $html = $res->decoded_content;
    $html = $res->content unless defined $html;

    # Parsing and XPath evaluation can die (for example on a malformed XPath
    # expression), so trap that here and let the loop continue.
    my $node;
    my $ok = eval {
        my $xp = HTML::TreeBuilder::XPath->new_from_content($html);
        $node = $xp->findnodes_as_string($xpath);
        $xp->delete;    # release the parsed tree
        1;
    };
    if ( !$ok ) {
        # Copy $@ immediately: it is a global that any later eval overwrites.
        my $err = $@ || 'unknown error';
        chomp $err;
        print {$error_log} "$url: XPath extraction failed: $err\n";
        next URL;
    }

    if ( !defined $node || $node eq '' ) {
        print "couldn't find the node\n";
        print {$error_log} "$url: no node matched $xpath\n";
    }
}

# Buffered write errors only surface at close, so check it.
close $error_log or die "Cannot close $error_file: $!";

在日志中打印错误并继续抓取其他网址

1 个答案: