我想根据 HTTP 响应状态码(例如 500 或 404)来更换代理服务器,也就是说,在出现这些状态码时触发 `process_exception` 来更改代理地址。
我创建了自己的 `ProxyMiddleware`,并在 `process_request` 中设置代理。默认情况下,当代理超时时会调用 `process_exception`。
但是,如何在自定义的 HTTP 状态码下触发它呢?
摘自 Scrapy 文档:
当下载处理程序或(下载中间件的)`process_request()` 引发异常(包括 `IgnoreRequest` 异常)时,Scrapy 会调用 `process_exception()`。
但我不知道如何实现这一点。
补充:我的爬虫(spider)代码:
class Spider1(CrawlSpider):
    """Crawl spider that starts from a search-result page and follows links.

    Every link extracted by the single rule below is followed, and each
    fetched page is handed to ``parse_item``.
    """

    # pageNumber = 0
    keyword = ''
    page = range(0, 40, 10)
    # allowed_domains takes bare domain names, not URLs; a full URL here
    # never matches a request's host, so followed links get filtered out.
    allowed_domains = ['somedomain.com']
    start_urls = ['http://somedomain.com/search.html?query=football']
    # The callback must not be named "parse": CrawlSpider implements its rule
    # handling in parse(), so overriding it stops the rules from being applied.
    rules = (
        Rule(LxmlLinkExtractor(), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        """Handle one crawled page and return the scraped item."""
        # NOTE(review): `item` is not defined in the visible snippet; the
        # item-building code appears to have been elided -- confirm.
        return item
我的settings.py:
# Downloader middleware registry: dotted class path -> order value.
# Entries are listed by ascending order value for readability; the position
# of an entry inside the dict itself carries no meaning.
DOWNLOADER_MIDDLEWARES = {
    "t.useragentmiddleware.RandomUserAgentMiddleware": 400,
    "scrapy.contrib.downloadermiddleware.retry.RetryMiddleware": 500,
    "scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware": 600,
    "t.cookiesmiddleware.CookiesMiddleware": 700,
    "scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware": 720,
    "t.proxymiddleware.ProxyMiddleware": 750,
    "scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware": 760,
}

# Keep redirect handling switched on.
REDIRECT_ENABLED = True
和proxymiddleware.py:
import json, os, random, socket
import t as spider1
import scrapy.exceptions as exception
# Module-level side effect: sockets created after this import get a
# 5-second default timeout.
socket.setdefaulttimeout(5)
class ProxyMiddleware(object):
    """Downloader middleware that routes requests through rotating proxies.

    A random proxy from the pool is attached to every request that does not
    already carry one.  When a download raises an exception, the proxy that
    was in use is dropped from the pool and a new one is assigned.  When a
    response arrives with a status listed in ``handle_httpstatus_list``, the
    request is re-issued through a different proxy, at most
    ``max_status_retries`` times.
    """

    # Most recent proxy URL handed out by this middleware.
    proxy = ''
    # Pool of scheme-less proxy addresses ('http://' is prepended on use);
    # replaced per instance in __init__.
    proxyList = []
    # Response statuses that trigger a proxy change and a retry.
    # NOTE(review): assumed to be the statuses the author wants to treat as
    # proxy failures -- confirm the values.
    handle_httpstatus_list = [302, 400]
    # Cap on status-triggered retries per request, so a URL that genuinely
    # answers with one of the statuses above cannot loop forever.
    max_status_retries = 3

    _SCHEME = 'http://'
    _RETRY_KEY = 'proxy_retry_times'

    def __init__(self, settings):
        """Load the proxy pool from ``<package location>/data/proxy.json``.

        ``settings`` is accepted for ``from_crawler`` but not read here.
        """
        # The project package is imported under the alias `spider1`; the
        # name `t` is not bound in this module.
        with open(spider1.location + '/data/proxy.json') as f:
            self.proxyList = json.load(f)['proxy']

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from a crawler, passing on its settings."""
        return cls(crawler.settings)

    def _strip_scheme(self, proxy_url):
        """Return ``proxy_url`` without a leading 'http://' (None stays None)."""
        if proxy_url and proxy_url.startswith(self._SCHEME):
            return proxy_url[len(self._SCHEME):]
        return proxy_url

    def _assign_proxy(self, request, avoid=None):
        """Attach a random pool proxy other than ``avoid`` to ``request``.

        Returns True when a proxy was assigned.  Returns False and leaves the
        request untouched when the pool has no suitable entry.
        """
        candidates = [p for p in self.proxyList if p != avoid]
        if not candidates:
            return False
        self.proxy = self._SCHEME + random.choice(candidates)
        # The chosen proxy is also mirrored into the process environment.
        os.environ['http_proxy'] = self.proxy
        request.meta['proxy'] = self.proxy
        return True

    def process_request(self, request, spider):
        """Give the request a proxy unless it already has one."""
        if 'proxy' in request.meta:
            return
        self._assign_proxy(request)

    def process_exception(self, request, exception, spider):
        """Drop the proxy that just failed and put a fresh one on the request.

        Returns None, so the exception keeps propagating to the remaining
        middlewares.
        """
        failed = self._strip_scheme(request.meta.get('proxy'))
        try:
            self.proxyList.remove(failed)
        except ValueError:
            # Not in the pool (already removed, or set by someone else).
            pass
        self._assign_proxy(request)

    def process_response(self, request, response, spider):
        """Return the response, or a retry request through another proxy.

        A retry is issued only for statuses in ``handle_httpstatus_list``,
        while the per-request retry budget lasts and another proxy exists.
        """
        if response.status not in self.handle_httpstatus_list:
            return response
        attempts = request.meta.get(self._RETRY_KEY, 0)
        if attempts >= self.max_status_retries:
            return response
        current = self._strip_scheme(request.meta.get('proxy'))
        if not self._assign_proxy(request, avoid=current):
            return response
        request.meta[self._RETRY_KEY] = attempts + 1
        # dont_filter keeps the duplicate filter from discarding the retry.
        return request.replace(dont_filter=True)
答案 0 :(得分:1)
有效的 HTTP 状态码并不属于“异常”,因此它们会经由 `process_response` 处理。
把更换代理的逻辑提取成一个方法,然后在 `process_exception` 和 `process_response` 中都调用它。
# Response statuses that should trigger a proxy change.
CHANGE_PROXY_STATUS_LIST = [502, 404]


class ProxyMiddleware(object):
    """Skeleton middleware sharing one proxy-change routine between the
    exception path and the bad-status path."""

    def change_proxy(self, request):
        """Switch the request to another proxy and decide whether to retry.

        Returns the request when it should be tried again, or None to give up.
        """
        # Change proxy here
        # Then check number of retries on the request
        # and decide if you want to give it another chance.
        # If not - return None else
        return request

    def process_exception(self, request, exception, spider):
        """Retry through another proxy when the download raised an exception."""
        return_request = self.change_proxy(request)
        if return_request:
            return return_request

    def process_response(self, request, response, spider):
        """Retry through another proxy on a listed status; otherwise pass the
        response through unchanged."""
        if response.status in CHANGE_PROXY_STATUS_LIST:
            return_request = self.change_proxy(request)
            if return_request:
                return return_request
        return response