I am working on a crawler and want to crawl politely by respecting robots.txt. Because the crawl covers a wide range of sites, the log files grow large and hard to work with, and most of the log entries come from robots.txt not being found on most sites. So my question is: is there a way to ignore the robots.txt-related errors and not log them, since I don't need to know whether the file was found or not?
I already have an errback handler for failed requests in my spider, but it does not apply to robots.txt, because that request is issued by Scrapy's middleware. Below is my code.
Spider:
import scrapy
from urllib.parse import urlparse
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError

class MySpider(scrapy.Spider):
    name = 'mobile'

    def start_requests(self):
        urls = [
            'https://site1.com',
            'http://site2.com'
        ]
        for url in urls:
            safe_no = 'test'
            yield scrapy.Request(url=url, callback=self.parse,
                                 errback=self.handle_error, meta={'safe_no': safe_no})

    def parse(self, response):
        safe_no = response.meta['safe_no']
        html_doc = response.body
        text_data, contacts, keep_no = self.get_contact(html_doc, response.url)
        # print(contacts, keep_no)
        link_found = False
        data = []
        parsed_uri = urlparse(response.url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        ### Parse data and get contact....
        if contacts:
            yield {
                'safe_no': safe_no,
                'url': response.url,
                'contacts': contacts,
                # 'text_data': text_data
            }

    def handle_error(self, failure):
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError : "%s"', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError : "%s"', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError : "%s"', request.url)
        else:
            request = failure.request
            self.logger.error('Can not connect : "%s" ', request.url)
Below is the crawler's log:
2019-01-10 15:33:36 [scrapy.downloadermiddlewares.robotstxt] ERROR: Error downloading <GET http://www.site1.com/robots.txt>: DNS lookup failed: no results for hostname lookup: www.site1.com.
Traceback (most recent call last):
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
    result = result.throwExceptionIntoGenerator(g)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\python\failure.py", line 491, in throwExceptionIntoGenerator
    return g.throw(self.type, self.value, self.tb)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
    defer.returnValue((yield download_func(request=request,spider=spider)))
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "c:\users\username\appdata\local\programs\python\python37-32\lib\site-packages\twisted\internet\endpoints.py", line 975, in startConnectionAttempts
    "no results for hostname lookup: {}".format(self._hostStr)
As you can see in the log, the handle_error method does not apply to the /robots.txt request. I did some research and found that the middleware can be configured to ignore certain errors, but I have had no luck with that so far.
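One workaround I have been considering (only an untested sketch on my side) is to raise the level of the logger that emits these messages; judging from the traceback above, it is named scrapy.downloadermiddlewares.robotstxt:

import logging

# Sketch: suppress ERROR records coming only from the robots.txt downloader
# middleware; the logger name is taken from the log output above.
logging.getLogger('scrapy.downloadermiddlewares.robotstxt').setLevel(logging.CRITICAL)

placed e.g. at the top of the spider module, but I would still prefer a way to handle these failures properly instead of just hiding all output from that logger.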
Answer 0 (score: 1)
Here is a small refactoring of your handle_error:
def handle_error(self, failure):
    # this is the original request
    request = failure.request
    if failure.check(DNSLookupError):
        self.logger.error('DNSLookupError : "%s"', request.url)
    elif request.url.endswith('/robots.txt'):
        pass
    elif failure.check(HttpError):
        # these exceptions come from HttpError spider middleware
        # you can get the non-200 response
        response = failure.value.response
        self.logger.error('HttpError : "%s"', response.url)
    elif failure.check(TimeoutError, TCPTimedOutError):
        self.logger.error('TimeoutError : "%s"', request.url)
    else:
        self.logger.error('Can not connect : "%s" ', request.url)
Your log example shows a DNS lookup error, which IMHO should be logged no matter what the particular URL is (it would fail even if it were not robots.txt, and it suggests the whole domain should be skipped going forward).
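If you do want to keep DNS errors for the pages themselves but hide the ones raised for robots.txt (which never reach your errback, because the middleware issues those requests itself), a possible alternative is a logging filter on that middleware's logger. A minimal sketch; the filter class name is made up:

import logging

class DropRobotsTxtErrors(logging.Filter):
    # drop any record whose rendered message mentions robots.txt
    def filter(self, record):
        return 'robots.txt' not in record.getMessage()

logging.getLogger('scrapy.downloadermiddlewares.robotstxt').addFilter(DropRobotsTxtErrors())

This leaves the rest of that logger's output untouched and only hides the robots.txt download failures.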
Answer 1 (score: 0)
In case anyone else is reading this, a simple solution I went with was to copy the base middleware class and comment out the extra detail it prints:
import logging

from twisted.internet.defer import Deferred, maybeDeferred

from scrapy import Request
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object

logger = logging.getLogger(__name__)


class MycrawlerRobotsTxtMiddleware:
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool("CUSTOM_ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent = crawler.settings.get("USER_AGENT", "Scrapy")
        self._robotstxt_useragent = crawler.settings.get("ROBOTSTXT_USER_AGENT", None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get("ROBOTSTXT_PARSER"))

        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        if request.meta.get("dont_obey_robotstxt"):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return
        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b"User-Agent", self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug(
                "Forbidden by robots.txt: %(request)s",
                {"request": request},
                extra={"spider": spider},
            )
            self.crawler.stats.inc_value("robotstxt/forbidden")
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={"dont_obey_robotstxt": True},
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value("robotstxt/request_count")

        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result

            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        # if failure.type is not IgnoreRequest:
        #     logger.error(
        #         "Error downloading %(request)s: %(f_exception)s",
        #         {"request": request, "f_exception": failure.value},
        #         exc_info=failure_to_exc_info(failure),
        #         extra={"spider": spider},
        #     )
        if failure.type is not IgnoreRequest:
            logger.error(f"Error downloading robots.txt: {request}")
        return failure

    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value("robotstxt/response_count")
        self.crawler.stats.inc_value(
            f"robotstxt/response_status_count/{response.status}"
        )
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = f"robotstxt/exception_count/{failure.type}"
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)
Then I added this to settings.py:
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Custom one written so it doesn't log every 404 response
CUSTOM_ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
    ...
    "mycrawler.middlewares.MycrawlerRobotsTxtMiddleware": 100,
}
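Because ROBOTSTXT_OBEY is False, the built-in RobotsTxtMiddleware raises NotConfigured and stays inactive; if you prefer to make that explicit, you could also (as an optional sketch, not something strictly required) switch it off by name in the same dict:

DOWNLOADER_MIDDLEWARES = {
    # optional: explicitly disable the stock robots.txt middleware
    "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": None,
    "mycrawler.middlewares.MycrawlerRobotsTxtMiddleware": 100,
}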