我正尝试从一个网站上抓取一些内容,遇到了一个可能很简单、但我一直找不到解决办法的问题。第一页可以正常抓取,但是当我(使用 curl)请求后面的页面时,得到的仍然是第 1 页的内容,这很奇怪。我猜这个网站有某种反抓取保护,但我找不到识别它的方法……
<?php
// Collect the href of every `.vignette a` link found on result pages 1 and 2.
$links = array();

// A single handle is reused for all pages so that cookies received on one
// request are sent back on the next one.
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_REFERER, "http://www.google.fr/");
curl_setopt($ch, CURLOPT_USERAGENT, "MozillaXYZ/1.0");
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_TIMEOUT, 100);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
// An empty file name loads no cookies but switches the cookie engine on,
// keeping cookies in memory for the lifetime of this handle.
curl_setopt($ch, CURLOPT_COOKIEFILE, '');

for ($i = 1; $i < 3; $i++) {
    // The page number is concatenated explicitly: inside a single-quoted
    // string "$i" is NOT interpolated and would be sent literally.
    $url = 'http://www.gites-de-france.com/location-vacances-chambre-hotes.html?page=' . $i
        . '&chambre=o&xhtml=O&acc=CHAMBRE,CHAMBRE&order_by=prix&order_by_tri=asc&';
    curl_setopt($ch, CURLOPT_URL, $url);

    $html = curl_exec($ch);
    if ($html === false) {
        // Report the transfer error and skip this page instead of parsing `false`.
        trigger_error('cURL request for page ' . $i . ' failed: ' . curl_error($ch), E_USER_WARNING);
        continue;
    }

    $doc = phpQuery::newDocument($html);
    foreach ($doc['.vignette a'] as $a) {
        $links[] = pq($a)->attr('href');
    }
}

curl_close($ch);
print_r($links);
?>
答案 0 :(得分:3)
这对我有用。
// Collect the href of every `.vignette a` link found on result pages 1 and 2,
// persisting cookies in cookie.txt so they are sent back on later requests.
$links = array();
$baseUrl = "http://www.gites-de-france.com/location-vacances-chambre-hotes.html";
$param = array(
    'chambre' => 'o',
    'xhtml' => 'O',
    'acc' => 'CHAMBRE,CHAMBRE',
    'order_by' => 'prix',
    'order_by_tri' => 'asc'
);

// The handle and its page-independent options are set up once and reused.
$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_REFERER        => "http://www.google.fr/",
    CURLOPT_USERAGENT      => "MozillaXYZ/1.0",
    CURLOPT_HEADER         => 0,
    CURLOPT_TIMEOUT        => 100,
    CURLOPT_FOLLOWLOCATION => 1,
    // COOKIEJAR: file the cookies are written to when the handle is closed.
    CURLOPT_COOKIEJAR      => 'cookie.txt',
    // COOKIEFILE: file the cookies are read from; also enables cookie handling.
    CURLOPT_COOKIEFILE     => 'cookie.txt',
));

for ($i = 1; $i < 3; $i++) {
    // Only the page number changes between requests; http_build_query()
    // takes care of URL-encoding every parameter.
    $param['page'] = $i;
    $url = "{$baseUrl}?" . http_build_query($param);
    curl_setopt($ch, CURLOPT_URL, $url);

    $html = curl_exec($ch);
    if ($html === false) {
        // Report the transfer error and skip this page instead of parsing `false`.
        trigger_error('cURL request for page ' . $i . ' failed: ' . curl_error($ch), E_USER_WARNING);
        continue;
    }

    $doc = phpQuery::newDocument($html);
    foreach ($doc['.vignette a'] as $a) {
        $links[] = pq($a)->attr('href');
    }
}

curl_close($ch);
print_r($links);
注意:我在运行脚本之前手动创建了 cookie.txt 文件。
答案 1 :(得分:1)
解决方案如下:这个网站使用 cookie 来传递会话 ID,所以必须在 curl 选项中加入以下代码:
// Point both cookie options of the existing handle at the same file:
// COOKIEJAR is where cookies are saved when the handle is closed,
// COOKIEFILE is where cookies are read from for outgoing requests.
curl_setopt_array($ch, array(
    CURLOPT_COOKIEJAR  => '/tmp/cookie.txt',
    CURLOPT_COOKIEFILE => '/tmp/cookie.txt',
));
它现在有效!
答案 2 :(得分:0)
您需要执行以下操作:
/**
 * Request a URL once per paging step and append every response body to $buffer.
 *
 * int $start      start page number
 * int $limit      maximum number of results
 * int $pgIncrmnt  number of results per page
 *
 * The loop bounds must be defined before the loop: with undefined (null)
 * bounds and a null increment the counter never advances and the loop
 * does not terminate.
 */
$start = 1;       // example value — adjust to the site's first page/offset
$limit = 100;
$pgIncrmnt = 10;  // example value — adjust to the site's page size
$buffer = null;

for ($j = $start; $j <= $limit; $j += $pgIncrmnt) {
    $chr = curl_init();
    curl_setopt($chr, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.6 (KHTML, like Gecko) Chrome/16.0.897.0 Safari/535.6');
    curl_setopt($chr, CURLOPT_HEADER, false);
    // NOTE(review): this URL does not contain $j, so every iteration requests
    // the same address — append the target site's paging parameter here.
    curl_setopt($chr, CURLOPT_URL, 'http://www.windowsphone.com/');
    curl_setopt($chr, CURLOPT_RETURNTRANSFER, true);
    // Force a new connection for each request and do not keep it for reuse.
    curl_setopt($chr, CURLOPT_FRESH_CONNECT, true);
    curl_setopt($chr, CURLOPT_FORBID_REUSE, true);
    curl_setopt($chr, CURLOPT_FOLLOWLOCATION, true);

    $response = curl_exec($chr);
    if ($response === false) {
        // Surface the transfer error; nothing is appended for a failed request.
        trigger_error('cURL request failed: ' . curl_error($chr), E_USER_WARNING);
    } else {
        $buffer .= $response;
    }
    curl_close($chr);
}