Goutte 在循环抓取多个网址时返回了错误的网址

时间:2016-01-25 06:24:34

标签: php goutte domcrawler

我正在使用https://github.com/FriendsOfPHP/Goutte。 点击while循环中的分页链接时,我一直收到错误的网址。

在第一次 while 循环中,对象上的 selectLink 返回了正确的 URL。但在第二次循环中,selectLink 似乎返回了错误的值。

这是代码。

/**
 * Keeps a reference to the injected Goutte client in $this->client.
 *
 * @param Goutte\Client $client client instance to store
 */
public function __construct(Goutte\Client $client)
{
    $this->client = $client;
}

/**
 * Walks a paginated listing starting from a hard-coded URL and dumps every
 * batch of pagination hrefs that was collected.
 *
 * The first page is requested directly. After that, as long as
 * getPaginationLinks() reports a next-page label other than the string
 * 'false', the link carrying that label is clicked and the resulting page
 * is processed the same way. Finishes by handing all batches to dd().
 */
public function parse()
{
    $startUrl = "https://www.nextag.com/Arts-Entertainment--zz2702147z0z1zB6c4z5---html";

    // Request the first page and narrow it down to its pagination element.
    $pagination = $this->paginationCrawler($this->client->request('GET', $startUrl));

    $batches = array();

    // Result layout: array(hrefs found on this page, next-page label or 'false').
    $result    = $this->getPaginationLinks($pagination);
    $batches[] = $result[0];
    $nextLabel = $result[1];

    // A label such as '11+' / '21+' means another block of pages is available.
    while ($nextLabel != 'false') {
        // Locate the link by its label inside the current pagination element and follow it.
        $nextLink   = $pagination->selectLink($nextLabel)->link();
        $pagination = $this->paginationCrawler($this->client->click($nextLink));

        $result    = $this->getPaginationLinks($pagination);
        $batches[] = $result[0];
        $nextLabel = $result[1];
    }

    dd($batches);
}

/**
 * Narrows the given crawler down to the node(s) matching `#pagination`.
 *
 * @param mixed $crawler object exposing filter(); presumably a DomCrawler
 *                       Crawler returned by the client - confirm against callers
 *
 * @return mixed whatever $crawler->filter('#pagination') yields
 */
public function paginationCrawler($crawler)
{
    $paginationNodes = $crawler->filter('#pagination');

    return $paginationNodes;
}

/**
 * Extracts the pagination hrefs and the "more pages" label from a
 * pagination crawler.
 *
 * @param mixed $links crawler scoped to the pagination element (the value
 *                     produced by paginationCrawler())
 *
 * @return array two-element array:
 *               [0] list of href attribute values of every `#numbers a` node,
 *               [1] trimmed text of the `#numbers :last-child` node when it
 *                   contains a '+', otherwise the string 'false'
 */
public function getPaginationLinks($links)
{
    // Collect the href attribute of every anchor inside #numbers.
    $hrefs = $links->filter('#numbers a')->each(function (Crawler $a) {
        return $a->attr('href');
    });

    // NOTE(review): this is a descendant selector, so it matches every
    // descendant of #numbers that is a last child, and text() reads only the
    // first match. If the pager markup is nested, '#numbers > :last-child'
    // may be what is intended - confirm against the real HTML.
    $lastNodes = $links->filter('#numbers :last-child');

    // text() throws InvalidArgumentException on an empty node list. A page
    // without a numbered pager therefore used to abort the whole crawl;
    // report "no further pages" through the existing 'false' sentinel instead.
    if (count($lastNodes) === 0) {
        return array($hrefs, 'false');
    }

    $lastPage = trim($lastNodes->text());

    // Only a label containing '+' (e.g. '11+') signals another block of pages.
    if (strpos($lastPage, '+') === false) {
        $lastPage = 'false';
    }

    return array($hrefs, $lastPage);
}

这是输出:

enter image description here

0 个答案:

没有答案