https://github.com/spatie/crawler
I have a crawl observer that looks like this:
<?php
namespace App\Observers;
use DOMDocument;
use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlObserver;
class PageCrawlObserver extends CrawlObserver
{
    private $pages = [];

    public function willCrawl(UriInterface $uri)
    {
        echo 'Now crawling: ' . (string) $uri . PHP_EOL;
    }

    /**
     * Called when the crawler has crawled the given url successfully.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawled(UriInterface $url, ResponseInterface $response, ?UriInterface $foundOnUrl = null)
    {
        $path = $url->getPath();

        $doc = new DOMDocument();
        // Cast the PSR-7 body stream to a string before parsing.
        @$doc->loadHTML((string) $response->getBody());

        // Guard against pages without a <title> element.
        $titleNode = $doc->getElementsByTagName('title')->item(0);
        $title = $titleNode !== null ? $titleNode->nodeValue : '';

        $this->pages[] = [
            'path'  => $path,
            'title' => $title,
        ];

        dd($this->pages);
        exit;
    }
    /**
     * Called when the crawler had a problem crawling the given url.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \GuzzleHttp\Exception\RequestException $requestException
     * @param \Psr\Http\Message\UriInterface|null $foundOnUrl
     */
    public function crawlFailed(UriInterface $url, RequestException $requestException, ?UriInterface $foundOnUrl = null)
    {
        echo 'Failed to crawl: ' . (string) $url . PHP_EOL;
    }
    public function finishedCrawling()
    {
        echo 'crawled ' . count($this->pages) . ' urls' . PHP_EOL;

        foreach ($this->pages as $page) {
            echo sprintf("Url path: %s Page title: %s%s", $page['path'], $page['title'], PHP_EOL);
        }
    }
}
The code works, but only if I put the `exit` call at the end of the `crawled` method. Without it, the script keeps running until it hits a timeout. I tried increasing PHP's max execution time, but it simply keeps loading until that limit is reached and then shows a timeout error. I need the `finishedCrawling` method to run, so I don't want to leave the `exit` in place as in the code above. Any ideas? Thanks!
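For completeness, here is roughly how the crawler is started (a minimal sketch, not my exact setup: the start URL is a placeholder, and `setTotalCrawlLimit` and the `CrawlInternalUrls` profile exist in recent spatie/crawler releases — older versions use `setMaximumCrawlCount`, and the profile's namespace may differ by version):

```php
<?php

use App\Observers\PageCrawlObserver;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlProfiles\CrawlInternalUrls;

$baseUrl = 'https://example.com'; // placeholder start URL

Crawler::create()
    // Stay on the same host so the crawl has a bounded URL set.
    ->setCrawlProfile(new CrawlInternalUrls($baseUrl))
    // Hard cap on crawled URLs, so finishedCrawling() is eventually reached.
    ->setTotalCrawlLimit(50)
    ->setCrawlObserver(new PageCrawlObserver())
    ->startCrawling($baseUrl);
```

Without a crawl profile or limit, the crawler follows every external link it finds, which may be why it never reaches `finishedCrawling` on its own.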