Comparing the text of crawled web pages

Time: 2011-04-28 00:57:31

Tags: php curl web-crawler

First off, I'm a PHP newbie, so if there is a better or more efficient way to do what I'm trying to do, please feel free to point it out :)

I came across an old PHP script that crawls a website and checks the response code of each page it finds. I have modified it to perform a duplicate-content check instead: it uses the similar_text function to compare the content of one page (specified by the user) against the content of every page it finds.

It's a bit slow, but it works. The only problem I'm having is that it stops after the first 10 links, and I can't figure out why. I apologize in advance; I know this is quite a lot of code. Any help is greatly appreciated.
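For reference, similar_text() compares two strings character by character and fills its optional third argument with the similarity as a percentage, which is the value this script reports. A minimal standalone sketch of that comparison (the URLs are placeholders):

$a = file_get_contents('http://example.com/page-a'); // placeholder URLs
$b = file_get_contents('http://example.com/page-b');
similar_text($a, $b, $percent); // $percent is filled by reference
echo round($percent, 2) . '% similar';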

<form action="<?php echo $_SERVER['PHP_SELF']; ?>" method="post">       
<div class="row"><label for="page1" class="small label"><strong>Page? </strong>: </label><input type="text" name="page1" id="page1" value="" size="40" /></div>         
<div class="row"><label for="url" class="small label"><strong>Please Enter URL </strong>: </label><input type="text" name="url" id="url" value="" size="40" /></div>
<div class="row"><label for="maxlinks" class="small label"><strong>Number of links to get </strong>: </label><input type="text" name="maxlinks" id="maxlinks" value="25" size="3"  maxlength="3" /></div>
<div class="row"><label for="linkdepth" class="small label"><strong>Links Maximum depth</strong> : </label> <select name="linkdepth" id="linkdepth" ><option value="1">1</option>
<option value="2" selected="selected">2</option>
<option value="3">3</option>
<option value="4">4</option>
<option value="5">5</option>
<option value="6">6</option>
</select></div> 
<input type="submit" name="submit" style="font-weight: bold" value="Check links" id="submit" />
</form>
<?php 
if (isset($_POST['submit'])){
    $page1 = ($_POST['page1']);
    $baseurl = ($_POST['url']);
    $pages = array();
    $i=($_POST['linkdepth']);
    $maxlinks = (integer)$_POST['maxlinks'];

$domain= extract_domain_name($baseurl); 
echo '<p class="small">Extracted domain name: <strong>'.$domain.'</strong>. ';
echo 'Maximum depth: <strong>'.$i.'</strong></p>';
function get_urls($page){
    global  $domain, $i;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $page);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_HEADER, true);
    /* Spoof the User-Agent header value; just to be safe */
    curl_setopt($ch, CURLOPT_USERAGENT, 
      'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
    /* I set timeout values for the connection and download
    because I don't want my script to get stuck 
    downloading huge files or trying to connect to 
    a nonresponsive server. These are optional. */
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 100);
    curl_setopt($ch, CURLOPT_TIMEOUT, 100);
    /* With CURLOPT_FAILONERROR enabled, 404 Not Found (and
       similar) responses would be treated as errors; it is
       disabled here, so error pages are still downloaded */
    curl_setopt($ch, CURLOPT_FAILONERROR, 0);

    /* Download the page */
    $html = curl_exec($ch);
    /* in case of an error; checking the stored result avoids
       executing the request a second time */
    if($html === false)
        {
        echo '<p class="small">Error. Please check URL: <strong style="color:#ae3100">' . curl_error($ch) . '</strong></p>';
        }

    curl_close($ch);

    if(!$html)   return false;
    /* Extract the BASE tag (if present) for
      relative-to-absolute URL conversions later */
    if(preg_match('/<base[\s]+href=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $matches)) {
        $base_url = $matches[1];
        echo $base_url;
    } else {
        $base_url = $page; // the base URL is the page a new check starts from
    }
    $links = array();
    $html = str_replace("\n", ' ', $html);

    /* this regexp is a combination of numerous versions I saw online */
    preg_match_all('/<a[\s]+[^>]*href\s*=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $m);
    foreach($m[1] as $url) {
        $url = trim($url);
        /* get rid of PHPSESSID, #linkname and javascript:, and decode &amp; */
        $url = preg_replace(
            array('/([\?&]PHPSESSID=\w+)$/i', '/(#[^\/]*)$/i', '/&amp;/', '/^(javascript:.*)/i'),
            array('', '', '&', ''),
            $url);

        /* turn relative URLs into absolute URLs;
           relative2absolute() is defined further down on this page */
        $url = relative2absolute($base_url, $url);

        // check if the URL is in the same (sub-)$domain
        if(preg_match("/^http[s]?:\/\/[^\/]*".str_replace('.', '\.', $domain)."/i", $url))
        {
            /* the number of slashes in the URL determines the link depth */
            $depth = substr_count($url, "/") - 2;

            if ($depth <= $i){
                /* 'true' makes the comparison strict; the bareword
                   'check' used here originally was an undefined constant */
                if(!in_array($url, $links, true)) $links[] = $url;
            }
        }
    }

     return $links; 

}  

// Function that returns the next page to crawl
function next_page(){
    global $pages;
    $k = 0;
    foreach(array_keys($pages) as $k => $page){
        if($pages[$page] == NULL){
            $k++;
            echo "[$k] - ";
            return $page;
        }
    }
    return NULL;
}

function add_urls($page){ // adds new unique URLs to the array and checks each URL's server header status
    global $pages, $maxlinks;

    $start = microtime(true);
    $urls = get_urls($page);
    $resptime = microtime(true) - $start; /* microtime(true) returns a float, so the subtraction
        works; timing each page makes it possible to see where the crawler stops responding */

    //Start checking for Server Header
    $ch = curl_init($page);
    curl_setopt($ch, CURLOPT_NOBODY, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

    // Execute
    curl_exec($ch);
    $info = curl_getinfo($ch);

    print "$page";

// If the status code is 200, then print OK, else NO
//       if($info['http_code']==200) {
$page1 = ($_POST['page1']);
$page1data = file_get_contents($page1); // note: the reference page is re-downloaded on every call
$page2 = file_get_contents($page);

$i = similar_text($page1data, $page2, $p);
$p = round($p, 2);

        echo ' -  Match Percentage:' . $p . '%';
//      } else {
//               echo '<strong style="color:#ba3d00"> NO </strong>';} 

            /* echo substr(($resptime),0,5). " seconds"; */ // Activate this to see how much time each page takes to crawl
            echo '<br/>';

        curl_close($ch); // Close handle

    $pages[$page] = array('resptime' => floor($resptime * 1000), 'url' => $page); // response time in milliseconds

    foreach($urls as $url){
        if(!array_key_exists($url, $pages)  && !in_array($url, $pages) && count($pages)<$maxlinks){
            $pages[$url] = NULL;
        } 

    }

}

echo '[1] - '; // label for the first URL, which comes straight from the form input
add_urls($baseurl);

while(($page = next_page()) != NULL){ // while there are URLs left to crawl
    add_urls($page);
}

    echo '<p class="small">Amount of crawled links: <strong>'.count ($pages).'</strong></p>'; 
    if (count($pages)<$maxlinks) echo '<p class="small">Sorry, no more links to crawl!!</p>';// count all extracted Urls
}

?><?php 
function extract_domain_name($url){
    /* old domain extractor 
    if(preg_match('@^(?:http:\/\/)?([^\/]+)@i', $url, $matches)) {
        return trim(strtolower($matches[1]));
    } else {
        return '';
    }*/
    preg_match("/^(http:\/\/)?([^\/]+)/i", $url, $matches);
    $host = $matches[2];
    // get last two segments of host name
    preg_match("/[^\.\/]+\.[^\.\/]+$/", $host, $matches);
    return $matches[0];

}

function relative2absolute($absolute, $relative) {
    $p = parse_url($relative);
    if($p["scheme"]) return $relative; // already absolute
    extract(parse_url($absolute));
    $path = dirname($path);
    if($relative[0] == '/')
    {
        $newPath = array_filter(explode("/", $relative));
    }
    else
    {
        $aparts = array_filter(explode("/", $path));
        $rparts = array_filter(explode("/", $relative));
        $cparts = array_merge($aparts, $rparts);
        $k = 0;
        $newPath = array();
        foreach($cparts as $i => $part)
        {
            if($part == '..')
            {
                // step one segment back up the path
                $k = $k - 1;
                $newPath[$k] = null;
            }
            else
            {
                $newPath[$k] = $cparts[$i];
                $k = $k + 1;
            }
        }
        $newPath = array_filter($newPath);
    }
    $path = implode("/", $newPath);
    $url = "";
    if($scheme)
    {
        $url = "$scheme://";
    }
    if($user)
    {
        $url .= "$user";
        if($pass)
        {
            $url .= ":$pass";
        }
        $url .= "@";
    }
    if($host)
    {
        $url .= "$host/";
    }
    $url .= $path;
    return $url;
}
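As a quick sanity check (not part of the original script), this is how the two helper functions above behave on sample URLs:

echo extract_domain_name('http://www.example.com/dir/page.html'); // example.com
echo relative2absolute('http://example.com/dir/page.html', '../other.html'); // http://example.com/other.html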

##################################################

1 answer:

Answer 0 (score: 1)

If it stops after roughly 30 seconds, add the following to the top of the script: set_time_limit(0);

PHP scripts are normally killed after 30 seconds, but you can override that limit.
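A minimal sketch of where that call would go (everything else in the script stays as it is):

<?php
set_time_limit(0); // 0 removes the execution time limit for this request
// ... the rest of the crawler script from the question ...

Note that set_time_limit() has no effect when PHP runs in safe mode; in that case the max_execution_time directive in php.ini has to be raised instead.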