我似乎无法使用这些代码:
$self->{_current_page} = $href;
my $response = $ua->get($href);
my $responseCode = $response->code;
if( $responseCode ne "404" ) {
my $content = LWP::Simple->get($href);
die "get failed: " . $href if (!defined $content);
}
将返回错误:get failed: http://www.google.com
完整代码如下:
#!/usr/bin/perl
use strict;
use URI;
use URI::http;
use File::Basename;
use DBI;
use LWP::Simple;
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);
package Crawler;
sub new {
my $class = shift;
my $self = {
_url => shift,
_max_link => 0,
_local => 1
};
bless $self, $class;
return $self;
}
sub trim{
my( $self, $string ) = @_;
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
sub process_image {
my ($self, $process_image) = @_;
$self->{_process_image} = $process_image;
}
sub local {
my ($self, $local) = @_;
$self->{_local} = $local;
}
sub max_link {
my ($self, $max_link) = @_;
$self->{_max_link} = $max_link;
}
sub x_more {
my ($self, $x_more) = @_;
$self->{_x_more} = $x_more;
}
sub resolve_href {
my ($base, $href) = @_;
my $uri = URI->new($href);
return $uri->rel($base);
}
sub write {
my ( $self, $ref, $data ) = @_;
open FILE, '>c:/perlscripts/' . $ref . '_' . $self->{_process_image} . '.txt';
foreach( $data ) {
print FILE $self->trim($_) . "\n";
}
close( FILE );
}
sub scrape {
my @m_error_array;
my @m_href_array;
my @href_array;
my ( $self, $DBhost, $DBuser, $DBpass, $DBname ) = @_;
my ($dbh, $query, $result, $array);
my $DNS = "dbi:mysql:$DBname:$DBhost:3306";
$dbh = DBI->connect($DNS, $DBuser, $DBpass ) or die $DBI::errstr;
if( defined( $self->{_process_image} ) && ( -e 'c:/perlscripts/href_w_' . $self->{_process_image} . ".txt" ) ) {
open ERROR_W, "<c:/perlscripts/error_w_" . $self->{_process_image} . ".txt";
open M_HREF_W, "<c:/perlscripts/m_href_w_" . $self->{_process_image} . ".txt";
open HREF_W, "<c:/perlscripts/href_w_" . $self->{_process_image} . ".txt";
@m_error_array = <ERROR_W>;
@m_href_array = <M_HREF_W>;
@href_array = <HREF_W>;
close ( ERROR_W );
close ( M_HREF_W );
close ( HREF_W );
}else{
@href_array = ( $self->{_url} );
}
my $z = 0;
while( @href_array ){
if( defined( $self->{_x_more} ) && $z == $self->{_x_more} ) {
last;
}
if( defined( $self->{_process_image} ) ) {
$self->write( 'm_href_w', @m_href_array );
$self->write( 'href_w', @href_array );
$self->write( 'error_w', @m_error_array );
}
$self->{_link_count} = scalar @m_href_array;
my $href = shift( @href_array );
my $info = URI::http->new($href);
my $host = $info->host;
$host =~ s/^www\.//;
$result = $dbh->prepare("INSERT INTO `". $host ."` (URL) VALUES ('$href')");
if( ! $result->execute() ){
$result = $dbh->prepare("CREATE TABLE `" . $host . "` ( `ID` INT( 255 ) NOT NULL AUTO_INCREMENT , `URL` VARCHAR( 255 ) NOT NULL , PRIMARY KEY ( `ID` )) ENGINE = MYISAM ;");
$result->execute();
}
$self->{_current_page} = $href;
my $response = $ua->get($href);
my $responseCode = $response->code;
if( $responseCode ne "404" ) {
my $content = LWP::Simple->get($href);
die "get failed: " . $href if (!defined $content);
}
#print $responseCode;
}
}
1;
#$query = "SELECT * FROM `actwebdesigns.co.uk` ORDER BY ID DESC";
#$result = $dbh->prepare($query);
#$result->execute();
#while( $array = $result->fetchrow_hashref() ) {
# print $array->{'URL'} . "\n";
#}
编辑:
仍未使用重定向修复。
my $redirect_limit = 10;
my $y = 0;
while( 1 && $y le $redirect_limit ) {
my $response = $ua->get($href);
my $responseCode = $response->code;
if( $responseCode == 200 || $responseCode == 301 || $responseCode == 302 ) {
if( $responseCode == 301 || $responseCode == 302 ) {
$href = $response->header('Location');
}else{
last;
}
}else{
push( @m_error_array, $href );
last;
}
$y++;
}
if( $y ne $redirect_limit ) {
if( ! defined( $self->{_url_list} ) ) {
my @url_list = ( $href );
}else{
my @url_list = $self->{_url_list};
push( @url_list, $href );
$self->{_url_list} = @url_list;
}
my $content = LWP::Simple->get($href);
die "get failed: " . $href if (!defined $content);
#$result = $dbh->prepare("INSERT INTO `". $host ."` (URL) VALUES ('$href')");
#if( ! $result->execute() ){
# $result = $dbh->prepare("CREATE TABLE `" . $host . "` ( `ID` INT( 255 ) NOT NULL AUTO_INCREMENT , `URL` VARCHAR( 255 ) NOT NULL , PRIMARY KEY ( `ID` )) ENGINE = MYISAM ;");
# $result->execute();
#}
print "good";
}else{
push( @m_error_array, $href );
}
答案 0 :(得分:2)
您应该检查响应代码以查看发生了什么(您已经在检查404)。我得到302 - 重定向。
例如:
die "get failed ($responseCode): " . $href if (!defined $content);
结果消息:
get failed (302): http://www.google.com at goog.pl line 20.
答案 1 :(得分:2)
一些想法。
1 /您似乎正在使用字符串比较运算符(le,ne)来比较数字。您应该使用数字比较运算符(&lt; =,!=)。
2 /您从LWP::UserAgent::get调用中获得的值是HTTP::Response对象。明智地使用该类的“is_foo”方法可能会使您的代码更清晰。
我不知道其中任何一个是否能解决您的问题。但它们会提高代码的质量。
答案 2 :(得分:1)
这是你的问题:
my $content = LWP::Simple->get($href);
将字符串“LWP :: Simple”作为'get'的第一个参数传递。你想要:
my $content = LWP::Simple::get($href);
答案 3 :(得分:-1)
检查您的SELinux设置。
启用SELINUX的系统不允许来自Web代理(httpd)的传出连接。
此页面可以告诉您有关SELinux和HTTPD设置的更多信息: http://wiki.centos.org/TipsAndTricks/SelinuxBooleans
在Perl脚本中启用Apache的出站Web连接:
# setsebool -P httpd_can_network_connect on