我正在使用DOMDocument()来加载html页面的文本。
加载页面需要很长时间。这是否意味着它也会下载图像?
有没有能更快地从URL加载html网页的替代方案或解决办法?
我使用DOMDocument()基本上提取元描述,标题文本,正文等。
如果能提供带有可运行代码的解决方案,我将不胜感激。
<?php
// Crawl a single page: fetch it, collect its links and record them in the
// `links` table. The time limit is lifted because a crawl issues many requests.
set_time_limit(0);
include "connection.php";
// Report only fatal and parse errors; warnings and notices stay hidden.
error_reporting(E_ERROR | E_PARSE);

// Parser that will hold the fetched page.
$doc = new DOMDocument();

// Without a ?url= parameter the target is taken from $urlfromdaemon (not set in
// this file — presumably by the script that includes it; confirm) and the run
// is flagged as silent so no progress markup is printed.
if (!isset($_GET['url'])) {
    $_GET['url'] = $urlfromdaemon;
    $silentcrawl = "set";
}

// loadHTMLFile() opens its argument through PHP's stream wrappers, so an
// unchecked value could name a local file or a php:// stream. Only http(s)
// URLs are fetched; anything else leaves $doc empty, which the rest of the
// script already treats as "no links found".
$pageUrl = $_GET['url'];
$pageScheme = is_string($pageUrl) ? parse_url($pageUrl, PHP_URL_SCHEME) : null;
if (is_string($pageScheme) && in_array(strtolower($pageScheme), array('http', 'https'), true)) {
    // This is the only download of the page itself: the HTML is fetched and
    // parsed here; images and other sub-resources are not requested.
    $doc->loadHTMLFile($pageUrl);
}

// Root URL of the crawled host, always built with the http scheme. The host
// falls back to an empty string when the URL cannot be parsed.
$base_url = is_string($pageUrl) ? parse_url($pageUrl) : false;
$base_url = 'http://' . ((is_array($base_url) && isset($base_url['host'])) ? $base_url['host'] : '') . '/';
// Walk every <a> element of the parsed page. The href goes into $urlarray and
// the element's text into $urlanchor, at the same position in both lists.
$urlarray = array();
$urlanchor = array();
foreach ($doc->getElementsByTagName("a") as $anchorNode) {
    $urlarray[] = $anchorNode->getAttribute('href');
    $urlanchor[] = $anchorNode->nodeValue;
}
// Turn every scraped href into an absolute http(s) URL.
//
//  - hrefs that already start with http:// or https:// are kept unchanged;
//  - protocol-relative hrefs (//host/path) get "http:" prepended;
//  - hrefs starting with "www." get "http://" prepended;
//  - empty hrefs, same-page fragments (#...) and hrefs with any other scheme
//    (mailto:, javascript:, tel:, ...) are dropped together with their anchor
//    text, because they do not name a page that can be crawled;
//  - everything else is resolved against the crawled page URL: root-relative
//    paths against the origin, query-only hrefs against the page path, and
//    plain relative paths against the page's directory with "." and ".."
//    segments collapsed.
//
// $urlarray and $urlanchor are rebuilt together so their indexes stay aligned.

// Split the crawled page URL once; relative hrefs are joined onto these parts
// instead of onto the raw URL string (which may carry a query string).
$pageParts = is_string($_GET['url']) ? parse_url($_GET['url']) : false;
if (!is_array($pageParts)) {
    $pageParts = array();
}
$pageOrigin = (isset($pageParts['scheme']) ? $pageParts['scheme'] : 'http') . '://'
    . (isset($pageParts['host']) ? $pageParts['host'] : '')
    . (isset($pageParts['port']) ? ':' . $pageParts['port'] : '');
$pagePath = (isset($pageParts['path']) && $pageParts['path'] !== '') ? $pageParts['path'] : '/';
if ($pagePath[0] !== '/') {
    $pagePath = '/' . $pagePath;
}
// Directory of the page: its path up to and including the last slash.
$pageDir = substr($pagePath, 0, strrpos($pagePath, '/') + 1);

// Probe results keyed by the href's first path segment, so each candidate host
// is contacted at most once per page.
$hostProbes = array();
$resolvedUrls = array();
$resolvedAnchors = array();

foreach ($urlarray as $i => $rawHref) {
    // Surrounding whitespace is not part of the URL; inner spaces are encoded
    // so the URL still works when it is requested.
    $href = str_ireplace(" ", "%20", trim((string) $rawHref));

    if ($href === '' || $href[0] === '#') {
        continue;
    }

    if (strncasecmp($href, 'http://', 7) === 0 || strncasecmp($href, 'https://', 8) === 0) {
        $absolute = $href;
    } elseif (strncmp($href, '//', 2) === 0) {
        $absolute = 'http:' . $href;
    } elseif (strncasecmp($href, 'www.', 4) === 0) {
        $absolute = 'http://' . $href;
    } elseif (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) {
        // Some other URI scheme: nothing to crawl over HTTP.
        continue;
    } elseif ($href[0] === '/') {
        $absolute = $pageOrigin . $href;
    } elseif ($href[0] === '?') {
        $absolute = $pageOrigin . $pagePath . $href;
    } else {
        // Keep the query/fragment aside so only the path is joined.
        $pathLength = strcspn($href, '?#');
        $relativePath = substr($href, 0, $pathLength);
        $suffix = (string) substr($href, $pathLength);

        $slashPos = strpos($relativePath, '/');
        $firstSegment = ($slashPos === false) ? $relativePath : substr($relativePath, 0, $slashPos);

        // A schemeless href such as "example.com/page" is ambiguous: it may be
        // a host written without its scheme or a relative path. As a heuristic
        // the candidate is requested once; if it answers, the href is treated
        // as a host. Segments without a dot, or starting with one ("./",
        // "../"), cannot be public host names, so they are never probed.
        $looksLikeHost = false;
        if ($firstSegment[0] !== '.' && strpos($firstSegment, '.') !== false) {
            $hostKey = strtolower($firstSegment);
            if (!array_key_exists($hostKey, $hostProbes)) {
                $hostProbes[$hostKey] = (@get_headers('http://' . $href, 1) !== false);
            }
            $looksLikeHost = $hostProbes[$hostKey];
        }

        if ($looksLikeHost) {
            $absolute = 'http://' . $href;
        } else {
            // Join onto the page's directory and collapse "." / ".." segments.
            // The first stack entry is the empty string before the leading
            // slash and is never popped, so ".." cannot climb above the root.
            $segments = explode('/', $pageDir . $relativePath);
            $lastIndex = count($segments) - 1;
            $stack = array();
            foreach ($segments as $index => $segment) {
                if ($segment === '.' || $segment === '..') {
                    if ($segment === '..' && count($stack) > 1) {
                        array_pop($stack);
                    }
                    // A trailing dot segment names a directory: keep the slash.
                    if ($index === $lastIndex) {
                        $stack[] = '';
                    }
                } else {
                    $stack[] = $segment;
                }
            }
            $absolute = $pageOrigin . implode('/', $stack) . $suffix;
        }
    }

    $resolvedUrls[] = $absolute;
    $resolvedAnchors[] = isset($urlanchor[$i]) ? $urlanchor[$i] : '';
}

$urlarray = $resolvedUrls;
$urlanchor = $resolvedAnchors;
// Request the headers of every collected link and record the live ones in the
// `links` table.
//
// A link counts as broken when no response is received at all or when the
// first status line carries code 404 (any HTTP version, any reason phrase).
// Broken links are only reported; live links are inserted, or, when the URL is
// already stored, the crawled page is appended to its referer list and the
// anchor text is merged into its anchor pool.
//
// URLs and anchor text come from the crawled page, so they are always bound as
// statement parameters and HTML-escaped before being echoed.
$pageUrl = $_GET['url'];
$showProgress = !isset($silentcrawl);

$selectLink = mysqli_prepare($con, "SELECT referer, anchor_pool FROM links WHERE url = ?");
$insertLink = mysqli_prepare($con, "INSERT INTO links (url,referer,anchor_pool) VALUES (?,?,?)");
$updateLink = mysqli_prepare($con, "UPDATE links SET referer = ?, anchor_pool = ? WHERE url = ?");
$dbReady = ($selectLink && $insertLink && $updateLink);

foreach ($urlarray as $i => $linkUrl) {
    $linkUrl = (string) $linkUrl;
    $linkHeaders = @get_headers($linkUrl);

    $statusCode = 0;
    if (is_array($linkHeaders) && isset($linkHeaders[0])
        && preg_match('#^HTTP/\S+\s+(\d{3})#', $linkHeaders[0], $statusMatch)) {
        $statusCode = (int) $statusMatch[1];
    }

    $printableUrl = htmlspecialchars($linkUrl, ENT_QUOTES, 'UTF-8');

    if (!is_array($linkHeaders) || $statusCode === 404) {
        if ($showProgress) {
            echo '<img style="width:20px;height:20px;float:left;" src="cross.png" > '.$printableUrl.'<br><Br>';
        }
        continue;
    }

    if ($showProgress) {
        echo '<img style="width:20px;height:20px;float:left;" src="tick.png" > '.$printableUrl.'<br><br>';
    }

    if (!$dbReady) {
        continue;
    }

    $anchorText = isset($urlanchor[$i]) ? (string) $urlanchor[$i] : '';

    // Read the stored rows into memory first so the connection is free for the
    // INSERT/UPDATE statements that follow.
    mysqli_stmt_bind_param($selectLink, 's', $linkUrl);
    mysqli_stmt_execute($selectLink);
    mysqli_stmt_store_result($selectLink);
    mysqli_stmt_bind_result($selectLink, $storedReferer, $storedAnchors);
    $existingRows = array();
    while (mysqli_stmt_fetch($selectLink)) {
        // The casts copy the values; the bound variables are overwritten by
        // the next fetch.
        $existingRows[] = array((string) $storedReferer, (string) $storedAnchors);
    }
    mysqli_stmt_free_result($selectLink);

    if (count($existingRows) === 0) {
        // First time this URL is seen: store it with the crawled page as its
        // only referer.
        mysqli_stmt_bind_param($insertLink, 'sss', $linkUrl, $pageUrl, $anchorText);
        mysqli_stmt_execute($insertLink);
        continue;
    }

    foreach ($existingRows as $existing) {
        list($knownReferers, $knownAnchors) = $existing;

        // Nothing to add when the page links to itself or is already listed
        // among the space-separated referers.
        if ($pageUrl == $linkUrl || in_array($pageUrl, explode(" ", $knownReferers), true)) {
            continue;
        }

        $newReferers = $knownReferers . " " . $pageUrl;
        // The anchor pool is a space-separated bag of unique words.
        $mergedAnchors = implode(' ', array_values(array_unique(explode(" ", $knownAnchors . ' ' . $anchorText))));

        mysqli_stmt_bind_param($updateLink, 'sss', $newReferers, $mergedAnchors, $linkUrl);
        mysqli_stmt_execute($updateLink);
    }
}

if ($selectLink) {
    mysqli_stmt_close($selectLink);
}
if ($insertLink) {
    mysqli_stmt_close($insertLink);
}
if ($updateLink) {
    mysqli_stmt_close($updateLink);
}
// array_filter() drops empty entries; if nothing is left, no usable link was
// collected and the user is told so.
$errors = array_filter($urlarray);
if (empty($errors)) {
    echo "Either the URL is down or page contains no Links !, Try entering URL along with protocol used.";
}
// Make sure the crawled page itself has a row in `links`.
//
// A page that is not stored yet is inserted with no referers, zero backlinks
// and status 1. A page that is already stored gets its status set to 1 only
// when a fresh request answers with code 200 (any HTTP version).
// The URL comes from the request, so it is always bound as a parameter.
$prime = $_GET['url'];

$lookupPrime = mysqli_prepare($con, "SELECT url FROM links WHERE url = ?");
if ($lookupPrime) {
    mysqli_stmt_bind_param($lookupPrime, 's', $prime);
    mysqli_stmt_execute($lookupPrime);
    mysqli_stmt_store_result($lookupPrime);
    $count = mysqli_stmt_num_rows($lookupPrime);
    mysqli_stmt_close($lookupPrime);

    if ($count == 0) {
        $insertPrime = mysqli_prepare(
            $con,
            "INSERT INTO links (url,referer,anchor_pool,backlinks,status) VALUES (?,'','','0','1')"
        );
        if ($insertPrime) {
            mysqli_stmt_bind_param($insertPrime, 's', $prime);
            mysqli_stmt_execute($insertPrime);
            mysqli_stmt_close($insertPrime);
        }
    } else {
        // get_headers() returns false when the request fails; only a parsed
        // 200 status line marks the page as reachable.
        $file_headers = @get_headers($prime);
        $isOk = is_array($file_headers)
            && isset($file_headers[0])
            && preg_match('#^HTTP/\S+\s+200\b#', $file_headers[0]) === 1;
        if ($isOk) {
            $markAlive = mysqli_prepare($con, "UPDATE links SET status='1' WHERE url = ?");
            if ($markAlive) {
                mysqli_stmt_bind_param($markAlive, 's', $prime);
                mysqli_stmt_execute($markAlive);
                mysqli_stmt_close($markAlive);
            }
        }
    }
}
// Recompute the backlink count of every stored link: the number of distinct
// hosts among its space-separated referers.
//
// Referers are iterated with foreach because the list can contain empty
// entries (a stored referer string may start with a space); walking it by
// numeric index after filtering would skip entries and leave full URLs in
// place of hosts, inflating the count.
$updateBacklinks = mysqli_prepare($con, "UPDATE links SET backlinks = ? WHERE url = ?");
$res = mysqli_query($con, "SELECT * from links ");

if ($updateBacklinks && $res) {
    while ($row = mysqli_fetch_array($res)) {
        $refererHosts = array();
        foreach (explode(" ", (string) $row['referer']) as $refererUrl) {
            if ($refererUrl === '') {
                continue;
            }
            // Referers without a parsable host contribute nothing.
            $refererHost = parse_url($refererUrl, PHP_URL_HOST);
            if (is_string($refererHost) && $refererHost !== '') {
                // Host names are case-insensitive; the array key de-duplicates.
                $refererHosts[strtolower($refererHost)] = true;
            }
        }

        $bk = count($refererHosts);
        $storedUrl = $row['url'];
        mysqli_stmt_bind_param($updateBacklinks, 'is', $bk, $storedUrl);
        mysqli_stmt_execute($updateBacklinks);
    }
    mysqli_free_result($res);
}

if ($updateBacklinks) {
    mysqli_stmt_close($updateBacklinks);
}
?>
答案 0 :(得分:0)
DOMDocument类只是一个HTML / XML解析器,仅此而已。
您尚未共享的代码可能使用PHP流包装器通过HTTP使用与加载本地文件相同的语法透明地下载远程资源。这是一项完全不同的任务。据我所知,PHP没有将功能齐全的Web爬虫作为其内置库的一部分捆绑。
编辑:下载是在下面这一行完成的:
$doc->loadHTMLFile($_GET['url']);
此行之后的所有内容都不是由下载问题引起的。