我有一个函数,用于从外部 URL 获取一些信息。问题是,如果目标网站对机器人设置了 noindex,这个函数就会失败,并导致随后的 foreach 循环崩溃。
错误消息:
警告:file_get_contents(http://webontwerp-arnhem.nl/contact):无法打开流:连接被拒绝,位于 /var/www/vhosts/free-sitemap-generator.com/httpdocs/includes/cra/simple_html_dom.php 第 79 行
致命错误:未捕获的 Error:在 boolean 上调用成员函数 find(),位于 /var/www/vhosts/free-sitemap-generator.com/httpdocs/includes/cra/xml-functions.php:60。堆栈跟踪:#0 /var/www/vhosts/free-sitemap-generator.com/httpdocs/crawler.php(44): crawl_site('http://webontwe...') #1 {main},抛出于 /var/www/vhosts/free-sitemap-generator.com/httpdocs/includes/cra/xml-functions.php 第 60 行
函数:
/**
 * Read a URL (or file path) and parse its contents into a simple_html_dom tree.
 *
 * The raw contents are fetched with file_get_contents(); a failed fetch still
 * emits PHP's own warning, and the caller is told about it via the `false`
 * return value, so every caller must check the result before using it.
 *
 * @param string        $url              Location to read.
 * @param bool          $use_include_path Forwarded to file_get_contents().
 * @param resource|null $context          Stream context forwarded to file_get_contents().
 * @param int           $offset           Byte offset to start reading at. Negative values
 *                                        (including the default -1) mean "from the start".
 * @param int           $maxLen           Accepted for signature compatibility; not used here.
 * @param bool          $lowercase        Forwarded to the simple_html_dom constructor and load().
 * @param bool          $forceTagsClosed  Forwarded to the simple_html_dom constructor.
 * @param string        $target_charset   Forwarded to the simple_html_dom constructor.
 * @param bool          $stripRN          Forwarded to the simple_html_dom constructor and load().
 * @param string        $defaultBRText    Forwarded to the simple_html_dom constructor.
 * @param string        $defaultSpanText  Forwarded to the simple_html_dom constructor.
 *
 * @return simple_html_dom|false The loaded DOM, or false when the fetch failed, the
 *                               contents are empty, or they exceed MAX_FILE_SIZE bytes.
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Since PHP 7.1 a negative offset is counted from the END of the stream, and
    // remote (http) streams cannot seek at all, so passing the default -1 through
    // would break the read. Treat any negative offset as "start at the beginning".
    $start = ($offset < 0) ? 0 : $offset;

    $contents = file_get_contents($url, $use_include_path, $context, $start);

    // empty() covers both a failed fetch (false) and an empty body.
    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) {
        return false;
    }

    // Build the DOM only once there is something to parse.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    $dom->load($contents, $lowercase, $stripRN);

    return $dom;
}
包含该循环并调用上述函数的函数:
/**
 * Fetch one page, print every new link that contains the page's own URL, and
 * record those links.
 *
 * Uses two global arrays, both keyed by urlencode()d URL:
 *   - $crawled_urls: value is the "YmdHis" timestamp of the last fetch attempt.
 *   - $found_urls:   value is 1 for every link already seen.
 *
 * A page is skipped when it was fetched less than 25 seconds ago. When the page
 * cannot be fetched (file_get_html() returns false) it is still marked as
 * crawled, and the function returns without touching the link list instead of
 * calling find() on a boolean.
 *
 * @param string $u URL of the page to crawl.
 *
 * @return array List of newly found URLs that contain $u (empty when the page
 *               was skipped or could not be fetched).
 */
function crawl_site($u) {
    global $crawled_urls, $found_urls;

    $urlList = array();
    $uen = urlencode($u);

    // Skip pages fetched within the last 25 seconds.
    $cutoff = date("YmdHis", strtotime('-25 seconds', time()));
    if (array_key_exists($uen, $crawled_urls) && $crawled_urls[$uen] >= $cutoff) {
        return $urlList;
    }

    $html = file_get_html($u);
    $crawled_urls[$uen] = date("YmdHis");

    // file_get_html() returns false on connection errors, empty pages, or
    // oversized pages; there is nothing to iterate over in that case.
    if (!is_object($html)) {
        return $urlList;
    }

    // Link prefixes that are not crawlable pages (mailto:, tel:, javascript:, ...).
    $skippedPrefixes = array("mail", "tel", "phone", "skype", "java");

    foreach ($html->find("a") as $li) {
        $url = perfect_url($li->href, $u);

        // Loose comparison on purpose: also skips null/false from perfect_url().
        if ($url == '') {
            continue;
        }

        $skip = false;
        foreach ($skippedPrefixes as $prefix) {
            if (strncmp($url, $prefix, strlen($prefix)) === 0) {
                $skip = true;
                break;
            }
        }
        if ($skip) {
            continue;
        }

        $enurl = urlencode($url);
        if (array_key_exists($enurl, $found_urls)) {
            continue;
        }
        $found_urls[$enurl] = 1;

        $str = basename($url);
        $dirn = dirname($url);

        // basename() can be empty; guard before looking at its first character.
        $isFragment = ($str !== '' && $str[0] === '#');
        $containsBase = (strpos($url, $u) !== false);

        if ($containsBase && !$isFragment) {
            // The URL comes from a crawled third-party page, so escape it before
            // writing it into our own HTML output.
            $safeDir = htmlspecialchars($dirn, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            $safeName = htmlspecialchars($str, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            echo "<li><div class='url-row'>$safeDir/<span class='strong'>$safeName</span></div></li>";
            $urlList[] = $url;
        }
    }

    return $urlList;
}
答案 0 :(得分:0)
您可以使用 cURL 代替 file_get_contents():
<?php
// Fetch a page with cURL instead of file_get_contents().
$url = 'http://webontwerp-arnhem.nl/contact';

$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => true,  // return the body instead of printing it
    CURLOPT_FOLLOWLOCATION => true,  // file_get_contents() follows redirects by default
    CURLOPT_CONNECTTIMEOUT => 10,    // seconds; do not hang on unreachable hosts
    CURLOPT_TIMEOUT        => 30,    // seconds; upper bound for the whole transfer
));

// curl_exec() returns false on failure (e.g. connection refused), so the
// result must be checked before it is handed to an HTML parser.
$contents = curl_exec($ch);
if ($contents === false) {
    error_log('cURL request for ' . $url . ' failed: ' . curl_error($ch));
}
curl_close($ch);