How can I ignore robots.txt errors so they don't show up in the logs?

Time: 2019-01-11 09:52:13

Tags: python-3.x web-crawler scrapy-spider

I am working on a crawler and want to crawl politely by obeying robots.txt. Because the crawl is broad, the log file grows large and becomes hard to work with, and most of the log entries are caused by robots.txt not being found on most sites. So my question is: is there a way to ignore robots.txt-related errors and not log them, since I don't need to know whether the file was found?

I already have an errback handler that deals with failed requests in my crawler, but it does not apply to robots.txt, because that request is issued by Scrapy's middleware. My code is below. Spider:

import scrapy
from urllib.parse import urlparse

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError


class MySpider(scrapy.Spider):

    name = 'mobile'

    def start_requests(self):
        urls = [
            'https://site1.com',
            'http://site2.com'
        ]
        for url in urls:
            safe_no = 'test'
            yield scrapy.Request(url=url, callback=self.parse,
                                 errback=self.handle_error, meta={'safe_no': safe_no})

    def parse(self, response):
        safe_no = response.meta['safe_no']
        html_doc = response.body
        text_data, contacts, keep_no = self.get_contact(html_doc, response.url)
        # print(contacts, keep_no)
        link_found = False
        data = []
        parsed_uri = urlparse(response.url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

        # ...parse data and get contacts...

        if contacts:
            yield {
                'safe_no': safe_no,
                'url': response.url,
                'contacts': contacts,
                # 'text_data': text_data
            }

    def handle_error(self, failure):
        if failure.check(HttpError):
            # these exceptions come from the HttpError spider middleware
            # you can get the non-200 response here
            response = failure.value.response
            self.logger.error('HttpError : "%s"', response.url)

        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError : "%s"', request.url)

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError : "%s"', request.url)

        else:
            request = failure.request
            self.logger.error('Can not connect : "%s"', request.url)

Here is the crawler's log:

    2019-01-10 15:33:36 [scrapy.downloadermiddlewares.robotstxt] ERROR: Error downloading <GET http://www.site1.com/robots.txt>: DNS lookup failed: no results for hostname lookup: www.site1.com.
Traceback (most recent call last):
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\python\failure.py", line 491, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
    defer.returnValue((yield download_func(request=request,spider=spider)))
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\endpoints.py", line 975, in startConnectionAttempts
    "no results for hostname lookup: {}".format(self._hostStr)

As you can see in the log, the handle_error method is not applied to the /robots.txt request. I did some research and found that the middleware can be configured to ignore certain errors, but so far I have had no luck.

2 Answers:

Answer 0 (score: 1):

Here is a small refactoring of your handle_error:

def handle_error(self, failure):
    # this is the original request
    request = failure.request
    if failure.check(DNSLookupError):
        self.logger.error('DNSLookupError : "%s"', request.url)
    elif request.url.endswith('/robots.txt'):
        pass
    elif failure.check(HttpError):
        # these exceptions come from the HttpError spider middleware
        # you can get the non-200 response here
        response = failure.value.response
        self.logger.error('HttpError : "%s"', response.url)
    elif failure.check(TimeoutError, TCPTimedOutError):
        self.logger.error('TimeoutError : "%s"', request.url)
    else:
        self.logger.error('Can not connect : "%s"', request.url)

Your log example shows a DNS lookup error, which IMHO should be logged regardless of the specific URL: the lookup would fail even if the request were not for robots.txt, which means the whole domain should be skipped from then on.
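That said, if the goal is purely to keep these messages out of the log, note that they are emitted by the scrapy.downloadermiddlewares.robotstxt logger (the logger name is visible in your excerpt), so you can simply raise that logger's level. A minimal sketch, assuming nothing else you care about logs under that name:

import logging

# Silence ERROR-level (and lower) messages from the built-in robots.txt
# downloader middleware. Run this once, e.g. in the spider's __init__.
logging.getLogger('scrapy.downloadermiddlewares.robotstxt').setLevel(logging.CRITICAL)

The trade-off is that this hides every non-critical message from that middleware, not just the "robots.txt not found" noise.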

Answer 1 (score: 0):

In case anyone else is reading this, a simple solution I went with was to take the base middleware class and comment out the extra details that get logged:


import logging

from twisted.internet.defer import Deferred, maybeDeferred

from scrapy import Request
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object

logger = logging.getLogger(__name__)


class MycrawlerRobotsTxtMiddleware:
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool("CUSTOM_ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent = crawler.settings.get("USER_AGENT", "Scrapy")
        self._robotstxt_useragent = crawler.settings.get("ROBOTSTXT_USER_AGENT", None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get("ROBOTSTXT_PARSER"))

        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        if request.meta.get("dont_obey_robotstxt"):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return

        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b"User-Agent", self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug(
                "Forbidden by robots.txt: %(request)s",
                {"request": request},
                extra={"spider": spider},
            )
            self.crawler.stats.inc_value("robotstxt/forbidden")
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={"dont_obey_robotstxt": True},
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value("robotstxt/request_count")

        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result

            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        # if failure.type is not IgnoreRequest:
        #     logger.error(
        #         "Error downloading %(request)s: %(f_exception)s",
        #         {"request": request, "f_exception": failure.value},
        #         exc_info=failure_to_exc_info(failure),
        #         extra={"spider": spider},
        #     )
        if failure.type is not IgnoreRequest:
            logger.error(f"Error downloading robots.txt: {request}")
        return failure

    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value("robotstxt/response_count")
        self.crawler.stats.inc_value(
            f"robotstxt/response_status_count/{response.status}"
        )
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = f"robotstxt/exception_count/{failure.type}"
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)

Then I added this to settings.py:


# Disable the stock robots.txt middleware
ROBOTSTXT_OBEY = False

# Enable the custom one above, which doesn't log an error for every missing robots.txt
CUSTOM_ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
    ...
    "mycrawler.middlewares.MycrawlerRobotsTxtMiddleware": 100,
}
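
If copying the whole middleware feels heavy-handed, a lighter alternative is to keep the stock RobotsTxtMiddleware (ROBOTSTXT_OBEY = True) and attach a logging filter that drops only its robots.txt download errors. A minimal sketch, assuming the messages keep the "Error downloading <GET .../robots.txt>: ..." shape shown in the question's log:

import logging


class DropRobotsTxtErrors(logging.Filter):
    """Drop 'Error downloading .../robots.txt' records emitted by the stock middleware."""

    def filter(self, record):
        message = record.getMessage()
        # returning False drops the record; everything else is kept
        return not (message.startswith('Error downloading') and 'robots.txt' in message)


# Attach the filter to the middleware's logger, e.g. from the spider's __init__.
logging.getLogger('scrapy.downloadermiddlewares.robotstxt').addFilter(DropRobotsTxtErrors())

This keeps the normal crawl behaviour and only suppresses those specific log records.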