我正在尝试使用 Simple HTML DOM 和 cURL 抓取某网站的列表。我可以用 Simple HTML DOM 连接到该网站并找到我需要的列表,但是当结果分页时,我遇到了麻烦。我找到了“下一页”链接的地址,并拼接出了我认为正确的 URL,但是我编写的用来解析结果的函数输出的却是第一页的副本。简而言之,同一页面被我 echo 了两次。我在 Fiddler 中查看了整个 POST/GET 流程,据我所知,请求分页页面的 GET 调用的请求头是正确的。
有人能看出哪里出错了吗?
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>Chelmsford Planning Applications</title>
</head>
<body>
<?php
require_once('simple_html_dom.php');
?>
<?php
// Report everything while developing the scraper.
error_reporting(E_ALL);
// Cookie jar shared by all requests: the search results live in a server-side
// session, so the session cookie obtained here must be reused when paginating.
$cookie_file = "cookies/cookiejar.txt";
$page = "http://publicaccess.chelmsford.gov.uk/online-applications/advancedSearchResults.do?action=firstPage";
// Form fields are the fields being posted by the search form.
$form_fields = "searchCriteria.reference=&searchCriteria.planningPortalReference=&searchCriteria.alternativeReference=&searchCriteria.description=&searchCriteria.applicantName=&searchCriteria.caseType=&searchCriteria.ward=&searchCriteria.parish=&searchCriteria.agent=&searchCriteria.caseStatus=&searchCriteria.caseDecision=&searchCriteria.appealStatus=&searchCriteria.appealDecision=&searchCriteria.developmentType=&caseAddressType=Application&searchCriteria.address=&date%28applicationReceivedStart%29=08%2F11%2F2013&date%28applicationReceivedEnd%29=18%2F11%2F2013&date%28applicationValidatedStart%29=&date%28applicationValidatedEnd%29=&date%28applicationCommitteeStart%29=&date%28applicationCommitteeEnd%29=&date%28applicationDecisionStart%29=&date%28applicationDecisionEnd%29=&date%28appealDecisionStart%29=&date%28appealDecisionEnd%29=&date%28applicationDeterminedStart%29=&date%28applicationDeterminedEnd%29=&searchType=Application";
$ch = curl_init();
// Start a fresh cookie session so stale session cookies from an earlier run
// are ignored when the jar is loaded.
curl_setopt($ch, CURLOPT_COOKIESESSION, true);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// Keep the response headers out of the returned string: only the HTML body
// should be handed to the DOM parser.
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_ENCODING, "");
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)');
curl_setopt($ch, CURLOPT_URL, $page);
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, $form_fields);
curl_setopt($ch, CURLOPT_VERBOSE, TRUE);
$str = curl_exec($ch);
// curl_error() is only meaningful after the transfer has been attempted.
if ($str === false) {
    echo 'Curl error: ' . curl_error($ch);
}
// Closing the handle flushes the received cookies to the cookie jar file.
curl_close($ch);
//Simple HTMLDOM Here....
// str_get_html() returns false for empty input, so guard before calling find().
$html = (is_string($str) && $str !== '') ? str_get_html($str) : false;
if ($html) {
    foreach ($html->find('ul li.searchresult') as $list) {
        echo $list;
    }
    //If we find a NEXT button
    if ($next = $html->find('a[class=next]', 0)) {
        //We append the next HREF to the base referer.
        //The attribute value is read as written in the markup, so entities such
        //as &amp; must be decoded or the query string sent to the server is wrong.
        $base = "http://publicaccess.chelmsford.gov.uk";
        $url = $base . html_entity_decode($next->href);
        // Display the Url to see if we are going to the right address.
        echo htmlspecialchars($url);
        //If we find a next link, we call the get_data function to follow the link
        //and search for the same type of data to display.
        get_data($url);
    }
}
?>
<?php
/**
 * Fetch a paginated results page with a GET request and echo every
 * "ul li.searchresult" element found in it.
 *
 * The request loads the cookie jar so the server can tie it to the search
 * session created by the initial POST; without that cookie the server has no
 * search to paginate.
 *
 * @param string $url         Absolute address of the paginated page.
 * @param string $cookie_file Path of the cookie jar to load and update.
 *                            Defaults to the jar used by the initial request.
 * @return void
 */
function get_data($url, $cookie_file = "cookies/cookiejar.txt")
{
    $new_page = $url;
    $ch = curl_init();
    // Send the cookies saved by the first request and keep the jar up to date.
    // CURLOPT_COOKIESESSION is deliberately NOT set here: it would discard the
    // session cookie we need to reuse.
    curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Return only the body so headers are not fed to the DOM parser.
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow redirects
    curl_setopt($ch, CURLOPT_ENCODING, ""); // handle all encodings
    curl_setopt($ch, CURLOPT_AUTOREFERER, true); // set referer on redirect
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)');
    curl_setopt($ch, CURLOPT_URL, $new_page); // This is the address of the paginated page
    $str1 = curl_exec($ch);
    if ($str1 === false) {
        echo 'Curl error: ' . curl_error($ch);
    }
    curl_close($ch);
    // str_get_html() returns false for empty input, so guard before calling find().
    $html = (is_string($str1) && $str1 !== '') ? str_get_html($str1) : false;
    if ($html) {
        // get all elements with class="searchresult"
        foreach ($html->find('ul li.searchresult') as $list) {
            echo $list;
        }
    }
    // Display the Url again just to make sure our function is being called.
    echo "the new page is " . $new_page;
}
?>
</body>
</html>