Question

我正在尝试使用Simple HTML Dom和Curl抓取网站列表。我可以使用简单的htmldom连接到网站并查找我需要的列表，但是，当结果分页时，我遇到了麻烦。我找到了“下一个”链接的来源，我正在用我认为正确的地址，但是，我编写的解析结果的函数是生成第一页的副本。简而言之，我两次回应同一页面。我正在研究Fiddler中的整个帖子/ GET流程，据我所知，GET调用分页页面中的标题是正确的。

任何人都可以发现它出错的地方吗？

 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>Chelmsford Planning Applications</title>
    </head>

    <body>
   <?php
require_once('simple_html_dom.php');
?>

<?php
$html = new simple_html_dom();
error_reporting(E_ALL);
$cookie_file = "cookies/cookiejar.txt";

$page = "http://publicaccess.chelmsford.gov.uk/online-applications/advancedSearchResults.do?action=firstPage";

// Form fields are the fields being posted by the search form.          
$form_fields = "searchCriteria.reference=&searchCriteria.planningPortalReference=&searchCriteria.alternativeReference=&searchCriteria.description=&searchCriteria.applicantName=&searchCriteria.caseType=&searchCriteria.ward=&searchCriteria.parish=&searchCriteria.agent=&searchCriteria.caseStatus=&searchCriteria.caseDecision=&searchCriteria.appealStatus=&searchCriteria.appealDecision=&searchCriteria.developmentType=&caseAddressType=Application&searchCriteria.address=&date%28applicationReceivedStart%29=08%2F11%2F2013&date%28applicationReceivedEnd%29=18%2F11%2F2013&date%28applicationValidatedStart%29=&date%28applicationValidatedEnd%29=&date%28applicationCommitteeStart%29=&date%28applicationCommitteeEnd%29=&date%28applicationDecisionStart%29=&date%28applicationDecisionEnd%29=&date%28appealDecisionStart%29=&date%28appealDecisionEnd%29=&date%28applicationDeterminedStart%29=&date%28applicationDeterminedEnd%29=&searchType=Application";

$ch = curl_init();
curl_setopt($ch, CURLOPT_COOKIESESSION, true);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_ENCODING, "");
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)');
curl_setopt($ch, CURLOPT_URL, $page);
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, $form_fields);
curl_setopt($ch, CURLOPT_VERBOSE, TRUE);
echo 'Curl error: ' . curl_error($ch); //no errrors 
$str = curl_exec($ch);
curl_close($ch);

//Simple HTMLDOM Here....
$html = str_get_html($str);
$ret  = $html->find('ul li.searchresult');
foreach ($ret as $list) {
    echo $list;
}
//If we find a NEXT button
if ($next = $html->find('a[class=next]', 0)) {
    //We apend the next HREF to the base referer  
    $base = "http://publicaccess.chelmsford.gov.uk";
    $url  = $base . $next->href;

    // Display the Url to see if we are going to the right address.
    echo $url;

    //If we find a next link, we call the get_data function to follow the link 
    //and search for the same type of data to display.
    get_data($url);
}
?>

 <?php
function get_data($url)
{
    $new_page = $url;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow redirects
    curl_setopt($ch, CURLOPT_ENCODING, ""); // handle all encodings
    curl_setopt($ch, CURLOPT_AUTOREFERER, true); // set referer on redirect
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)');
    curl_setopt($ch, CURLOPT_URL, $new_page); // This is the address of the paginated page
    $str1 = curl_exec($ch);
    curl_close($ch);

    $html = str_get_html($str1);
    # get all elements with class="searchresult"                           
    $ret  = $html->find('ul li.searchresult');
    foreach ($ret as $list) {
        echo $list;

    }
    // Display the Url again just to make sure our function is being called.
    echo "the new page is " . $new_page;
}
?>

    </body>
    </html>

PHP卷曲页面刮板分页问题

0 个答案: