我正在尝试编写一个中间件,该中间件在出现异常或响应代码在列表[400、500、502、503、504、403、408、429]中时更改用户代理和代理。
但是问题是由于某些未知原因,刮板正在跳过某些请求,而我找不到原因。另外,我不确定settings.py中的DOWNLOADER_MIDDLEWARES设置的值。下面是我的中间件对象。
class GoogleCrawlerDownloaderMiddleware(object):
def __init__(self):
self.ua_list = user_agents #a list of user-agents
self.ua = choice(self.ua_list)
self.proxies = self.get_proxies()
self.proxy = choice(self.proxies)
def process_request(self, request, spider):
request.headers['User-Agent'] = self.ua
request.meta['proxy'] = self.proxy
def process_response(self, request, response, spider):
if response.status in [400, 500, 502, 503, 504, 403, 408, 429]:
raise IgnoreRequest
else:
return response
def process_exception(self, request, exception, spider):
try:
self.ua_list.remove(str(request.headers['User-Agent'],'UTF-8'))
self.proxies.remove(request.meta['proxy'])
#restore useragents and proxies
if len(self.ua_list) == 0:
self.ua_list = user_agents
if len(self.proxies) == 0:
self.proxies = self.get_proxies()
#if removing proxy and user agents possible then change the self.proxy & self.ua
self.proxy = choice(self.proxies)
self.ua = choice(self.ua_list)
except:
pass
#reschedule request
print(">>Rescheduling request<<")
sleep(2)
return request
我在做什么错,我的DOWNLOADER_MIDDLEWARES设置应该是什么?