Question

我正在使用DOMDocument（）来加载html页面的文本。

加载页面需要很长时间。这是否意味着它也会下载图像？

有没有能更快地从网址加载 HTML 网页的替代方案或解决办法？

我使用DOMDocument（）基本上提取元描述，标题文本，正文等。

如果能提供带有可运行代码的解决方案，将不胜感激。

<?php
set_time_limit(0);
include "connection.php";

error_reporting(E_ERROR | E_PARSE);

// Without a ?url= parameter the script falls back to $urlfromdaemon and
// raises the $silentcrawl flag, which the output code further down checks
// before echoing anything.
// NOTE(review): $urlfromdaemon is not defined in this file - presumably the
// including script sets it; confirm.
if (!isset($_GET['url'])) {
    $_GET['url'] = $urlfromdaemon;
    $silentcrawl = "set";
}

// Download and parse the target page. loadHTMLFile() performs the HTTP
// request itself; it fetches only the HTML document, not the images or
// other resources the page refers to.
$doc = new DOMDocument();
$doc->loadHTMLFile($_GET['url']);

// Root of the crawled site, always rebuilt with the http:// scheme.
$urlParts = parse_url($_GET['url']);
$base_url = 'http://' . $urlParts['host'] . '/';

  // Collect the href and the visible text of every <a> element in the page.
  // $urlarray[$n] and $urlanchor[$n] always describe the same link.
  // Both arrays are initialised up front so they exist even when the page
  // contains no links (the original never initialised $urlanchor).
  $urlarray = array();
  $urlanchor = array();
  foreach ($doc->getElementsByTagName("a") as $anchorNode) {
      // Spaces are percent-encoded here, in the same pass, so that the
      // HTTP checks performed later receive a usable URL.
      $urlarray[] = str_ireplace(" ", "%20", $anchorNode->getAttribute('href'));
      $urlanchor[] = $anchorNode->nodeValue;
  }
//

// Turn every collected href into an absolute URL, in place.
// Hrefs that already start with "http://" are left untouched (the outer
// `if` below has no else branch). Everything else is classified by prefix.
for($i=0;$i<count($urlarray);$i++){
// Non-empty only when the first 7 characters are "http://" (case-insensitive).
$result=stristr(substr($urlarray[$i], 0, 7),"http://");
if($result==''){

// Case 1: already an https:// URL - deliberately left unchanged.
if(stristr(substr($urlarray[$i], 0, 8),"https://")!=''){

}

// Case 2: protocol-relative URL ("//host/path") - give it the http: scheme.
else if(stristr(substr($urlarray[$i], 0, 2),"//")!=''){
$urlarray[$i]= 'http:'.$urlarray[$i];
}

// Case 3: no scheme and no "www." prefix - the href is either a bare host
// name ("example.com/page") or a path relative to the crawled page.
else if(stristr(substr($urlarray[$i], 0, 4),"www.")==''){
// The two possibilities are told apart by actually requesting
// "http://<href>" and checking whether any response comes back.
// NOTE(review): this is one blocking HTTP request for every such link and
// is likely a major contributor to the script's run time.

$urlcheck='http://'.$urlarray[$i];
$headers = @get_headers($urlcheck, 1);
if ($headers === FALSE) { // No response: treat the href as a relative path, not a host name.

// Page URL ends with "/": the href can be appended directly.
if(substr($_GET['url'],-1)=='/'){
$urlarray[$i]= $_GET['url'].$urlarray[$i];
}
else{
// Page URL does not end with "/" but its path component is exactly "/"
// (presumably a URL such as http://host/?query - verify).
if(parse_url($_GET['url'], PHP_URL_PATH)=='/'){
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).$urlarray[$i];

}
// Page URL with its basename removed ends with "/".
else if(substr(str_ireplace(basename($_GET['url']),"",$_GET['url']),-1)=='/'){

// What remains is just "http://" - appears to cover a page URL consisting
// of scheme and host only (e.g. http://example.com); a "/" separator is
// inserted before the href. TODO confirm.
if(substr(str_ireplace(basename($_GET['url']),"",$_GET['url']),-7)=='http://'){
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).'/'.$urlarray[$i];
}
// Otherwise drop the page's file name and append the href in its place.
// NOTE(review): str_ireplace() removes every occurrence of $trim in the
// URL, not only the trailing one - a basename that also appears earlier
// in the URL would be stripped there too.
else{
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).$urlarray[$i];
}

}

// Fallback: strip the path's basename and join with an explicit "/".
else{
$trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
$urlarray[$i]= str_ireplace($trim,"",$_GET['url']).'/'.$urlarray[$i];
}
}

}
// The probe got a response: the href is a host name, so just add the scheme.
else {

$urlarray[$i]= 'http://'.$urlarray[$i];

}



//
}



// Case 4: starts with "www." - add the http:// scheme.
else{
$urlarray[$i]='http://'.$urlarray[$i];

}

}

}

// Check every collected link over HTTP, report the result (unless running
// silently) and record each link that is not a 404 in the `links` table.
//
// For a link that is already stored, the crawled page is appended to its
// space-separated referer list and the link's anchor text is merged into
// its space-separated anchor pool - unless the page is already listed as a
// referer or the link points at the crawled page itself.
//
// All values reaching SQL come from the request or from the remote page, so
// they are passed as bound parameters instead of being interpolated into
// the query text. The statements are prepared once and reused per link.
$pageUrl = $_GET['url'];

$findLink = mysqli_prepare($con, "SELECT referer, anchor_pool FROM links WHERE url = ?");
$addLink = mysqli_prepare($con, "INSERT INTO links (url,referer,anchor_pool) VALUES (?,?,?)");
$mergeLink = mysqli_prepare($con, "UPDATE links SET referer = ?, anchor_pool = ? WHERE url = ?");
if (!$findLink || !$addLink || !$mergeLink) {
    throw new RuntimeException('Could not prepare link statements: ' . mysqli_error($con));
}

$linkCount = count($urlarray);
for ($i = 0; $i < $linkCount; $i++) {
    $link = $urlarray[$i];
    $anchorText = $urlanchor[$i];

    // NOTE(review): one blocking HTTP request per link - on pages with many
    // links this is likely where most of the run time goes.
    $file_headers = @get_headers($link);
    $statusLine = (is_array($file_headers) && isset($file_headers[0])) ? $file_headers[0] : '';

    // Match the status code rather than one exact status line, so that
    // "HTTP/1.0 404 ..." or a different reason phrase is recognised too.
    // A request that failed outright (no headers) is still treated as a
    // live link, exactly as before.
    $isNotFound = preg_match('#^HTTP/\S+\s+404\b#', $statusLine) === 1;

    if (!isset($silentcrawl)) {
        $icon = $isNotFound ? 'cross.png' : 'tick.png';
        // The URL comes from a remote page: escape it before echoing it into HTML.
        echo '<img style="width:20px;height:20px;float:left;" src="' . $icon . '" > '
            . htmlspecialchars($link, ENT_QUOTES, 'UTF-8') . '<br><br>';
    }

    if ($isNotFound) {
        continue;
    }

    // Look the link up; the result is buffered so other statements can be
    // executed while its rows are being read.
    mysqli_stmt_bind_param($findLink, 's', $link);
    mysqli_stmt_execute($findLink);
    mysqli_stmt_store_result($findLink);
    mysqli_stmt_bind_result($findLink, $storedReferer, $storedAnchors);

    if (mysqli_stmt_num_rows($findLink) == 0) {
        // First sighting of this link: store it with the crawled page as referer.
        mysqli_stmt_free_result($findLink);
        mysqli_stmt_bind_param($addLink, 'sss', $link, $pageUrl, $anchorText);
        mysqli_stmt_execute($addLink);
        continue;
    }

    while (mysqli_stmt_fetch($findLink)) {
        $referers = explode(" ", (string) $storedReferer);

        // Skip when this page is already a recorded referer, or when the
        // link simply points back at the page being crawled.
        if (in_array($pageUrl, $referers, true) || $pageUrl === $link) {
            continue;
        }

        $newReferer = $storedReferer . " " . $pageUrl;

        // Merge the new anchor text into the pool, dropping duplicate words.
        $anchorWords = explode(" ", $storedAnchors . ' ' . $anchorText);
        $finalAnchor = implode(' ', array_unique($anchorWords));

        mysqli_stmt_bind_param($mergeLink, 'sss', $newReferer, $finalAnchor, $link);
        mysqli_stmt_execute($mergeLink);
    }
    mysqli_stmt_free_result($findLink);
}

mysqli_stmt_close($findLink);
mysqli_stmt_close($addLink);
mysqli_stmt_close($mergeLink);

// Print a notice when not a single non-empty link was collected from the page.
$nonEmptyLinks = array_filter($urlarray);

if (empty($nonEmptyLinks)) {
    echo "Either the URL is down or page contains no Links !, Try entering URL along with protocol used.";
}


// Make sure the crawled page itself has a row in `links`.
// A page seen for the first time is inserted with no referers, no anchors,
// zero backlinks and status 1. A page that is already stored has its status
// set to 1 when it currently answers with HTTP 200.
//
// $_GET['url'] is user input, so it is passed to MySQL as a bound parameter
// instead of being interpolated into the query text.
$prime = $_GET['url'];

$findPrime = mysqli_prepare($con, "SELECT url FROM links WHERE url = ?");
if (!$findPrime) {
    throw new RuntimeException('Could not prepare page lookup: ' . mysqli_error($con));
}
mysqli_stmt_bind_param($findPrime, 's', $prime);
mysqli_stmt_execute($findPrime);
mysqli_stmt_store_result($findPrime);
$count = (int) mysqli_stmt_num_rows($findPrime);
mysqli_stmt_close($findPrime);

if ($count === 0) {
    $addPrime = mysqli_prepare(
        $con,
        "INSERT INTO links (url,referer,anchor_pool,backlinks,status) VALUES (?,'','','0','1')"
    );
    if (!$addPrime) {
        throw new RuntimeException('Could not prepare page insert: ' . mysqli_error($con));
    }
    mysqli_stmt_bind_param($addPrime, 's', $prime);
    mysqli_stmt_execute($addPrime);
    mysqli_stmt_close($addPrime);
} else {
    $file_headers = @get_headers($prime);
    $statusLine = (is_array($file_headers) && isset($file_headers[0])) ? $file_headers[0] : '';

    // Match the status code instead of the exact string "HTTP/1.1 200 OK",
    // so that other protocol versions or reason phrases also count as alive.
    if (preg_match('#^HTTP/\S+\s+200\b#', $statusLine) === 1) {
        $markLive = mysqli_prepare($con, "update links set status='1' where url = ?");
        if (!$markLive) {
            throw new RuntimeException('Could not prepare status update: ' . mysqli_error($con));
        }
        mysqli_stmt_bind_param($markLive, 's', $prime);
        mysqli_stmt_execute($markLive);
        mysqli_stmt_close($markLive);
    }
}


// Recompute the `backlinks` column for every stored link: the number of
// distinct host names found in its space-separated referer list.
//
// The original filtered empty entries out of the list with array_filter()
// (which keeps the original keys) and then walked it with a 0..count-1 index
// loop. Whenever the list began with an empty entry - e.g. a referer field
// that started out as '' and later had " <url>" appended - real entries were
// skipped and left as full URLs, so referers from the same host were counted
// more than once. Iterating with foreach avoids any dependence on the keys.
$res = mysqli_query($con, "SELECT * from links ");
// The stored URL originally came from crawled pages, so it is bound as a
// parameter instead of being interpolated into the UPDATE.
$setBacklinks = mysqli_prepare($con, "update links set backlinks = ? where url = ?");
if (!$res || !$setBacklinks) {
    throw new RuntimeException('Could not prepare backlink recount: ' . mysqli_error($con));
}

while ($row = mysqli_fetch_array($res)) {
    // Host names are used as array keys so that duplicates collapse.
    $hosts = array();
    foreach (explode(" ", (string) $row['referer']) as $refererUrl) {
        if ($refererUrl === '') {
            continue;
        }
        $host = parse_url($refererUrl, PHP_URL_HOST);
        // Entries without a usable host do not count as backlinks.
        if (is_string($host) && $host !== '') {
            $hosts[$host] = true;
        }
    }

    $bk = count($hosts);
    $rowUrl = $row['url'];
    mysqli_stmt_bind_param($setBacklinks, 'is', $bk, $rowUrl);
    mysqli_stmt_execute($setBacklinks);
}

mysqli_stmt_close($setBacklinks);

?>

Answer 1

DOMDocument 类只是一个 HTML/XML 解析器，仅此而已。

您尚未贴出的那部分代码，可能是借助 PHP 的流包装器（stream wrapper），用与加载本地文件相同的语法，通过 HTTP 透明地下载远程资源。这与解析是完全不同的两项任务。据我所知，PHP 的内置库中并没有捆绑功能完备的 Web 爬虫。

编辑：下载发生在下面这一行：

$doc->loadHTMLFile($_GET['url']);

此行之后的所有内容都不是由下载问题引起的。

DOMDocument（）花费太多时间来加载页面

1 个答案: