我正在使用CURL从url中删除html。它在我使用的80%的网址中运行良好。但是有些网址似乎并不“可擦”。例如,当我试图刮取http://www.thefancy.com时,它不起作用。网站继续加载,最后它不会返回结果。问题是可以测试的:http://www.itemmized.com/test/test/这是我的代码:
if($_POST['submit']) {
function curl_exec_follow($ch, &$maxredirect = null) {
$mr = $maxredirect === null ? 5 : intval($maxredirect);
if (ini_get('open_basedir') == '' && ini_get('safe_mode' == 'Off')) {
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $mr > 0);
curl_setopt($ch, CURLOPT_MAXREDIRS, $mr);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
} else {
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
if ($mr > 0)
{
$original_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
$newurl = $original_url;
$rch = curl_copy_handle($ch);
curl_setopt($rch, CURLOPT_HEADER, true);
curl_setopt($rch, CURLOPT_NOBODY, true);
curl_setopt($rch, CURLOPT_FORBID_REUSE, false);
do
{
curl_setopt($rch, CURLOPT_URL, $newurl);
$header = curl_exec($rch);
if (curl_errno($rch)) {
$code = 0;
} else {
$code = curl_getinfo($rch, CURLINFO_HTTP_CODE);
if ($code == 301 || $code == 302) {
preg_match('/Location:(.*?)\n/', $header, $matches);
$newurl = trim(array_pop($matches));
// if no scheme is present then the new url is a
// relative path and thus needs some extra care
if(!preg_match("/^https?:/i", $newurl)){
$newurl = $original_url . $newurl;
}
} else {
$code = 0;
}
}
} while ($code && --$mr);
curl_close($rch);
if (!$mr)
{
if ($maxredirect === null)
trigger_error('Too many redirects.', E_USER_WARNING);
else
$maxredirect = 0;
return false;
}
curl_setopt($ch, CURLOPT_URL, $newurl);
}
}
return curl_exec($ch);
}
$ch = curl_init($_POST['form_url']);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$data = curl_exec_follow($ch);
curl_close($ch);
echo $data;
答案 0 :(得分:4)
试试这个..希望这有帮助...
<?php
class Curl
{
public $cookieJar = "";
public function __construct($cookieJarFile = 'cookies.txt') {
$this->cookieJar = $cookieJarFile;
}
function setup()
{
$header = array();
$header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
$header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
$header[] = "Cache-Control: max-age=0";
$header[] = "Connection: keep-alive";
$header[] = "Keep-Alive: 300";
$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
$header[] = "Accept-Language: en-us,en;q=0.5";
$header[] = "Pragma: "; // browsers keep this blank.
curl_setopt($this->curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.8.1.7) Gecko/20070914 Firefox/2.0.0.7');
curl_setopt($this->curl, CURLOPT_HTTPHEADER, $header);
curl_setopt($this->curl,CURLOPT_COOKIEJAR, $cookieJar);
curl_setopt($this->curl,CURLOPT_COOKIEFILE, $cookieJar);
curl_setopt($this->curl,CURLOPT_AUTOREFERER, true);
curl_setopt($this->curl,CURLOPT_FOLLOWLOCATION, true);
curl_setopt($this->curl,CURLOPT_RETURNTRANSFER, true);
}
function get($url)
{
$this->curl = curl_init($url);
$this->setup();
return $this->request();
}
function getAll($reg,$str)
{
preg_match_all($reg,$str,$matches);
return $matches[1];
}
function postForm($url, $fields, $referer='')
{
$this->curl = curl_init($url);
$this->setup();
curl_setopt($this->curl, CURLOPT_URL, $url);
curl_setopt($this->curl, CURLOPT_POST, 1);
curl_setopt($this->curl, CURLOPT_REFERER, $referer);
curl_setopt($this->curl, CURLOPT_POSTFIELDS, $fields);
return $this->request();
}
function getInfo($info)
{
$info = ($info == 'lasturl') ? curl_getinfo($this->curl, CURLINFO_EFFECTIVE_URL) : curl_getinfo($this->curl, $info);
return $info;
}
function request()
{
return curl_exec($this->curl);
}
}
{
$curl = new Curl();
$html = $curl->get("http://www.thefancy.com");
echo "$html";
}
?>
答案 1 :(得分:0)
可能你无法抓取http://www.thefancy.com,因为每次到达页面底部时,新内容都会加载,所以实际上你试图通过cUrl获取大量信息,这可能就是问题所在。 。你只是得到一个超时尝试用更大的数字设置php.ini中的超时并再次尝试。可能它的gona需要一段时间来加载,但我认为这样可以起作用。