First of all, I'm a PHP newbie, so if there is a better or more efficient way to do what I'm trying to do, please feel free to point it out :)
I came across an old PHP script that crawls a website and checks the response code of each page it finds. I have modified it to do a duplicate-content check instead: it uses the similar_text function to compare the content of one page (specified by the user) with the content of every page it finds.
It's a bit slow, but it works. The only problem I'm having is that it stops after the first 10 links, and I can't figure out why. I apologize in advance; I know this is quite a lot of code. Any help is greatly appreciated.
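For reference, the core of the comparison is just similar_text with its percentage out-parameter; here is a minimal sketch of that step on its own (the two URLs are placeholders, not taken from the script below):
<?php
// Minimal sketch of the duplicate-content check in isolation.
// The two URLs are placeholders; substitute the pages you want to compare.
$referenceHtml = file_get_contents('http://example.com/page-a'); // page specified by the user
$crawledHtml   = file_get_contents('http://example.com/page-b'); // page found by the crawler
similar_text($referenceHtml, $crawledHtml, $percent); // third argument receives the similarity in percent
echo 'Match Percentage: ' . round($percent, 2) . '%';
?>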
<form action="<?php echo $_SERVER['PHP_SELF']; ?>" method="post">
<div class="row"><label for="page1" class="small label"><strong>Page? </strong>: </label><input type="text" name="page1" id="page1" value="" size="40" /></div>
<div class="row"><label for="url" class="small label"><strong>Please Enter URL </strong>: </label><input type="text" name="url" id="url" value="" size="40" /></div>
<div class="row"><label for="maxlinks" class="small label"><strong>Number of links to get </strong>: </label><input type="text" name="maxlinks" id="maxlinks" value="25" size="3" maxlength="3" /></div>
<div class="row"><label for="linkdepth" class="small label"><strong>Links Maximum depth</strong> : </label> <select name="linkdepth" id="linkdepth" ><option value="1">1</option>
<option value="2" selected="selected">2</option>
<option value="3">3</option>
<option value="4">4</option>
<option value="5">5</option>
<option value="6">6</option>
</select></div>
<input type="submit" name="submit" style="font-weight: bold" value="Check links" id="submit" />
</form>
<?php
if (isset($_POST['submit'])){
$page1 = ($_POST['page1']);
$baseurl = ($_POST['url']);
$pages = array();
$i=($_POST['linkdepth']);
$maxlinks = (integer)$_POST['maxlinks'];
$domain= extract_domain_name($baseurl);
echo '<p class="small">Extracted domain name: <strong>'.$domain.'</strong>. ';
echo 'Maximum depth: <strong>'.$i.'</strong></p>';
function get_urls($page){
global $domain, $i;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $page);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_HEADER, true);
/* Spoof the User-Agent header value; just to be safe */
curl_setopt($ch, CURLOPT_USERAGENT,
'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
/* I set timeout values for the connection and download
because I don't want my script to get stuck
downloading huge files or trying to connect to
a nonresponsive server. These are optional. */
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 100);
curl_setopt($ch, CURLOPT_TIMEOUT, 100);
/* CURLOPT_FAILONERROR is disabled here, so 404 Not Found
(and similar error responses) are still downloaded
instead of being treated as cURL errors */
curl_setopt($ch, CURLOPT_FAILONERROR, 0);
/* Download the page */
$html = curl_exec($ch);
/* In case of an error (check the stored result instead of calling curl_exec() a second time) */
if($html === false)
{
echo '<p class="small">Error. Please check URL: <strong style="color:#ae3100">' . curl_error($ch) . '</strong></p>';
}
curl_close($ch);
if(!$html) return false;
/* Extract the BASE tag (if present) for
relative-to-absolute URL conversions later */
if(preg_match('/<base[\s]+href=\s*[\"\']?([^\'\" >]+)[\'\" >]/i',$html, $matches)){
$base_url=$matches[1];
echo $base_url;
} else {
$base_url=$page; // base URL = the page the new check starts from
}
$links=array();
$html = str_replace("\n", ' ', $html);
preg_match_all('/<a[\s]+[^>]*href\s*=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $m);
/* this regexp is a combination of numerous
versions I saw online*/
foreach($m[1] as $url) {
$url=trim($url);
/* get rid of PHPSESSID and #fragments, decode &amp; entities, and drop javascript: links */
$url=preg_replace(
array('/([\?&]PHPSESSID=\w+)$/i','/(#[^\/]*)$/i', '/&amp;/i','/^(javascript:.*)/i'),
array('','','&',''),
$url);
/* turn relative URLs into absolute URLs.
relative2absolute() is defined further down
below on this page. */
$url = relative2absolute($base_url, $url);
// check if in the same (sub-)$domain
if(preg_match("/^http[s]?:\/\/[^\/]*".str_replace('.', '\.', $domain)."/i", $url))
{
$depth= substr_count($url, "/")-2 ;
/* Counts slashes in URL
Responsible for link depth
*/
if ($depth <= $i){
if(!in_array($url, $links, true)) $links[]=$url; /* strict in_array; the original third argument "check" was an undefined constant */
}
}
}
return $links;
}
// Function that returns the next page to crawl
function next_page(){
global $pages;
$k=0;
foreach( array_keys($pages) as $k=> $page){
if($pages[$page] == NULL){
$k++;
echo "[$k] - ";
return $page;
}
}
return NULL;
}
function add_urls($page){ // adds new unique URLs into the array and checks each URL's server header status
global $pages, $maxlinks;
$start = microtime(true);
$urls = get_urls($page);
$resptime = microtime(true) - $start; // microtime(true) returns a float, so this shows on which page the crawler stops responding.
//Start checking for Server Header
$ch = curl_init($page);
curl_setopt($ch, CURLOPT_NOBODY, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
// Execute
curl_exec($ch);
$info = curl_getinfo($ch);
print "$page";
// If the status code is 200, then print OK, else print NO
// if($info['http_code']==200) {
$page1 = ($_POST['page1']);
$page1data = file_get_contents($page1);
$page2 = file_get_contents($page);
$i = similar_text($page1data, $page2, $p);
$p = round($p, 2);
echo ' - Match Percentage:' . $p . '%';
// } else {
// echo '<strong style="color:#ba3d00"> NO </strong>';}
/* echo substr(($resptime),0,5). " seconds"; */ // Activate this to see how much time each page takes to crawl
echo '<br/>';
curl_close($ch); // Close handle
$pages[$page] = array ('resptime' => floor($resptime * 9000), 'url' => $page);
foreach($urls as $url){
if(!array_key_exists($url, $pages) && !in_array($url, $pages) && count($pages)<$maxlinks){
$pages[$url] = NULL;
}
}
}
echo '[1] - '; // label for the first URL, which comes straight from the form input
add_urls($baseurl);
while(($page= next_page()) != NULL ) //while there are urls available
{
add_urls($page);
}
echo '<p class="small">Amount of crawled links: <strong>'.count ($pages).'</strong></p>';
if (count($pages)<$maxlinks) echo '<p class="small">Sorry, no more links to crawl!!</p>';// count all extracted Urls
}
?><?php
function extract_domain_name($url){
/* old domain extractor
if(preg_match('@^(?:http:\/\/)?([^\/]+)@i', $url, $matches)) {
return trim(strtolower($matches[1]));
} else {
return '';
}*/
preg_match("/^(http:\/\/)?([^\/]+)/i", $url, $matches);
$host = $matches[2];
// get last two segments of host name
preg_match("/[^\.\/]+\.[^\.\/]+$/", $host, $matches);
return $matches[0];
}
function relative2absolute($absolute, $relative) {
$p = parse_url($relative);
if($p["scheme"])return $relative;
extract(parse_url($absolute));
$path = dirname($path);
if($relative[0] == '/') // [0] instead of {0}: curly-brace string offsets are removed in PHP 8
{
$newPath = array_filter(explode("/", $relative));
}
else
{
$aparts = array_filter(explode("/", $path));
$rparts = array_filter(explode("/", $relative));
$cparts = array_merge($aparts, $rparts);
$k = 0;
$newPath = array();
foreach($cparts as $i => $part)
{
if($part == '..')
{
$k = $k - 1;
$newPath[$k] = null;
}
else
{
$newPath[$k] = $cparts[$i];
$k = $k + 1;
}
}
$newPath = array_filter($newPath);
}
$path = implode("/", $newPath);
$url = "";
if($scheme)
{
$url = "$scheme://";
}
if($user)
{
$url .= "$user";
if($pass)
{
$url .= ":$pass";
}
$url .= "@";
}
if($host)
{
$url .= "$host/";
}
$url .= $path;
return $url;
}
##################################################
Answer 0 (score: 1):
If it happens to stop after roughly 30 seconds, add the following to the top of the script: set_time_limit(0);
By default, PHP scripts are killed after 30 seconds, but you can override that.
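A minimal sketch of where that call goes, assuming the whole crawler lives in a single script (the rest of the file stays unchanged):
<?php
// Lift the default 30-second execution limit so a long crawl is not killed mid-run.
// set_time_limit(0) means "no time limit" for this request.
set_time_limit(0);
// ... the form and crawler code shown above follow here ...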