Suppose that before I launch my Scrapy crawler, I don't know whether a site is served over HTTPS or HTTP. So I always try HTTPS first, with code like the following (e.g. https://www.wsiltv.com/random):
from datetime import datetime

import scrapy
from scrapy import Request
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, ConnectionRefusedError

class MySpider(scrapy.Spider):
    name = "myspider"

    def __init__(self, category=None):
        super().__init__()
        self.failed_urls = []

    def start_requests(self):
        urls = ['https://www.WSILTV.COM/random',  # a few more URLs here
                ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse,
                                 meta={'dont_redirect': True},
                                 errback=self.err_callback)

    def parse(self, response):
        cur_datetime = datetime.now().strftime("%Y-%m-%d %H:%M")
        yield {'text': response.body, 'fetch_date': cur_datetime}

    def err_callback(self, failure):
        # REF: https://doc.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing
        if failure.check(HttpError):
            # I want to record URLs that caused HTTP errors
            self.failed_urls.append([failure.value.response.url, failure.value.response.status])
            return
        elif failure.check(DNSLookupError):
            # Similarly, I'd like to record those which caused DNS errors
            # E.g., 'https://non-existent-url.com'
            self.failed_urls.append([failure.request.url, 'DNSLookupError'])
            return
        elif failure.check(ConnectionRefusedError):
            # Here, I'd like to try the HTTP version of the original URL such as 'http://www.WSILTV.COM/random'
            # (as written, this request is never scheduled -- that is my question below)
            new_request = Request('http://www.WSILTV.COM/random', callback=self.parse, meta={'dont_redirect': True})
        else:
            return
In other words, I'd like to know how to queue a new request (crawl) from within the err_callback method. Also, is there a better (more efficient) way in Scrapy to try https first and then, if that fails, fall back to http later?

Thanks in advance!
Answer 0 (score: 3)
You can modify

    elif failure.check(ConnectionRefusedError):
        # Here, I'd like to try the HTTP version of the original URL such as 'http://www.WSILTV.COM/random'
        new_request = Request('http://www.WSILTV.COM/random', callback=self.parse, meta={'dont_redirect': True})
    else:

as shown below:
    elif failure.check(ConnectionRefusedError):
        # Here, try the HTTP version of the original URL such as 'http://www.WSILTV.COM/random'
        new_request = Request('http://www.WSILTV.COM/random', callback=self.parse, meta={'dont_redirect': True})
        self.crawler.engine.crawl(new_request, self.crawler.spider)
    else:
This queues the new request directly from the error handler.
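For reference, here is a minimal sketch of how the whole errback might look with that change applied, generalized with urllib.parse so the HTTP fallback is derived from whatever URL actually failed instead of being hard-coded. The method names (err_callback, parse) and self.crawler come from the question's spider; the scheme-swapping logic is my own addition and is untested:

    from urllib.parse import urlsplit, urlunsplit

    from scrapy import Request
    from twisted.internet.error import ConnectionRefusedError

    # Inside the spider class:
    def err_callback(self, failure):
        if failure.check(ConnectionRefusedError):
            parts = urlsplit(failure.request.url)
            if parts.scheme == 'https':
                # Rebuild the failed URL with the scheme swapped to http.
                http_url = urlunsplit(('http',) + parts[1:])
                new_request = Request(http_url, callback=self.parse,
                                      meta={'dont_redirect': True},
                                      errback=self.err_callback)
                # Schedule the fallback request on the running engine,
                # as in the answer above (may need adjusting for your
                # Scrapy version; see the note below).
                self.crawler.engine.crawl(new_request, self.crawler.spider)

The scheme check also prevents an infinite retry loop: if the http fallback is refused as well, no further request is built. One caveat: recent Scrapy releases deprecate the spider argument of engine.crawl(), so depending on your version the call may need to be self.crawler.engine.crawl(new_request) instead.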