I wrote a script in PHP to check whether a website's external links are broken. Here is the script:
<?php
// It may take a while to spider a website ...
set_time_limit(10000);

// Include the phpcrawl main class
include_once('../PHPCrawl_083/PHPCrawl_083/libs/PHPCrawler.class.php');
include('check.php');

// Extend the class and override the handleDocumentInfo()-method
class MyCrawler extends PHPCrawler
{
    function handleDocumentInfo(PHPCrawlerDocumentInfo $DocInfo)
    {
        // Line break: plain newline on the CLI, <br /> in a browser
        if (PHP_SAPI == "cli") $lb = "\n";
        else $lb = "<br />";

        // Fetch the page and collect every href attribute
        $file = file_get_contents($DocInfo->url);
        preg_match_all('/<a[^>]+href="([^"]+)/i', $file, $urls);
        echo '<br/>';

        $home_url = parse_url($_SERVER['HTTP_HOST']);
        foreach ($urls as $url) {
            for ($i = 0; $i < sizeof($url); $i++) {
                $link_url = parse_url($url[$i]);
                // Only check links that point outside the current host
                if ($link_url['host'] != $home_url['host']) {
                    if (check_url($url[$i]) === false) {
                        // Print the requested URL, its HTTP status code and the offending link
                        echo "Page requested: ".$DocInfo->url." (".$DocInfo->http_status_code.")".$lb;
                        echo '<br/>';
                        echo "<font color=green>lien externe invalide : ".$url[$i].$lb."</font>"; // "invalid external link"
                        echo '<br/>';
                    }
                }
            }
        }
    }
}

$crawler = new MyCrawler();
$crawler->setURL("http://www.tunisie-web.org");
$crawler->addURLFilterRule("#\.(jpg|gif|png|pdf|jpeg|css|js)$# i");
$crawler->setWorkingDirectory("C:/Users/mayss/Documents/travailcrawl/");
$crawler->go();
?>
But it reports more than just external links (and the ones it flags aren't actually broken): it even lists "http://www.tunisie-web.org" itself as a broken link, and I can't see where the problem is. Please help. Here is check.php:
<?php
// Returns true if the URL passes PHP's validation filter, false otherwise
function check_url($url) {
    if (!filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_QUERY_REQUIRED) === false) {
        return true;
    }
    else {
        return false;
    }
}
?>
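
After posting this I took another look at the filter flags, and I suspect FILTER_FLAG_QUERY_REQUIRED is the culprit: as far as I can tell, it makes filter_var() reject any URL that has no ?key=value query string, so a perfectly well-formed link like http://www.tunisie-web.org fails check_url() and gets reported as broken. A minimal sketch of check.php without that flag:

<?php
// check.php without FILTER_FLAG_QUERY_REQUIRED: validate the URL's
// syntax only, so query-less URLs are accepted too.
function check_url($url) {
    return filter_var($url, FILTER_VALIDATE_URL) !== false;
}
?>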
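
Another thing I noticed, assuming standard parse_url() behaviour: $_SERVER['HTTP_HOST'] contains a bare host name with no scheme, and parse_url() puts a scheme-less string entirely into the 'path' element, so $home_url['host'] is never set and the "is this link external?" test compares every link against null. A short sketch of the difference:

<?php
// parse_url() only fills in 'host' when the string has a scheme:
var_dump(parse_url('www.tunisie-web.org'));
// => ['path' => 'www.tunisie-web.org']  -- no 'host' key at all

var_dump(parse_url('http://www.tunisie-web.org'));
// => ['scheme' => 'http', 'host' => 'www.tunisie-web.org']

// So in the crawler, prefixing the scheme should make the host check work:
$home_url = parse_url('http://' . $_SERVER['HTTP_HOST']);
?>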
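
Finally, even with both fixes, filter_var() only checks that a URL is well-formed; it never contacts the server, so it cannot tell whether a link is actually dead. Below is a sketch of an aliveness test with get_headers() (the function name url_is_alive and the 4xx/5xx handling are my own assumptions, not part of PHPCrawl):

<?php
// Sketch: consider a link dead if the request fails outright or the
// server answers with a 4xx/5xx status. Requires allow_url_fopen.
function url_is_alive($url) {
    $headers = @get_headers($url);        // false if the request fails
    if ($headers === false) return false;
    // $headers[0] looks like "HTTP/1.1 200 OK"
    return preg_match('#HTTP/\S+\s+[45]\d\d#', $headers[0]) !== 1;
}

// Hypothetical usage inside handleDocumentInfo():
// if (!url_is_alive($url[$i])) { /* report broken external link */ }
?>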