我尝试在 Scrapy 的 CrawlSpider 中处理 HTTP 302 状态码。我在 Google、本网站以及 Scrapy 文档 https://docs.scrapy.org/en/latest/topics/downloader-middleware.html?highlight=302 中进行了搜索,并尝试了以下代码:
# Attempt 1: spider-level list of HTTP status codes the spider wants to
# receive in its callbacks.
handle_httpstatus_list = [302]
# Attempt 2: per-request meta keys; these only take effect when the dict is
# actually passed to a request, e.g. scrapy.Request(url, meta=meta).
meta = {'dont_redirect': True, "handle_httpstatus_list": [302]}
# Attempt 3: spider-level settings override that turns redirects off.
custom_settings = {'REDIRECT_ENABLED': False}
但这些方法对我都不起作用。
这是我的代码:
class LagouSpider(CrawlSpider):
    """Crawl lagou.com job pages after a Selenium-driven login.

    The spider opens the login page in Chrome, then starts the crawl from
    ``start_urls[0]`` with the cookies produced by the (elided) login code.
    Links are followed according to ``rules``; only links matching the
    ``jobs/<digits>.html`` pattern are delivered to ``parse_job``.
    """

    # Ask for 302 responses to be delivered to callbacks instead of being
    # followed/filtered.
    handle_httpstatus_list = [302]
    # Per-request meta. As a bare class attribute this is not attached to
    # any request; it only matters where it is passed as ``meta=self.meta``.
    meta = {'dont_redirect': True, "handle_httpstatus_list": [302]}
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com']
    login_url = "https://passport.lagou.com/login/login.html"
    custom_settings = {'REDIRECT_ENABLED': False}

    # Raw strings avoid the invalid ``\d`` escape of a plain literal, and
    # ``\.`` matches the literal dot before ``html`` rather than any char.
    # NOTE(review): the first two rules have no callback, so a 302 returned
    # for a ``zhaopin/`` or ``gongsi/`` link never reaches ``parse_job``.
    rules = (
        Rule(LinkExtractor(allow=(r"zhaopin/.*",)), follow=True),
        Rule(LinkExtractor(allow=(r"gongsi/j\d+\.html",)), follow=True),
        Rule(
            LinkExtractor(allow=r'jobs/\d+\.html'),
            callback='parse_job',
            follow=True,
        ),
    )

    # Browser-like headers. NOTE(review): nothing in the visible code passes
    # these to a request -- confirm whether the elided code uses them.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': 'None',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-Requested-With': 'XMLHttpRequest',
    }

    def start_requests(self):
        """Log in through a Chrome browser and return the first request.

        Returns:
            A one-element list holding the request for ``start_urls[0]``,
            carrying the login cookies and bypassing the duplicate filter.
        """
        # Kept because the elided login code below may assign these globals.
        global rc, im
        browser = webdriver.Chrome(executable_path="/home/wqh/下载/chromedriver")
        browser.get(self.login_url)
        # ··········(some code)
        # NOTE(review): ``cookie_dict`` is expected to be built by the elided
        # code above -- confirm.

        # Alternative that was tried without success (kept for reference):
        # return [scrapy.Request(self.start_urls[0], cookies=cookie_dict,
        #                        meta=self.meta)]
        return [
            scrapy.Request(
                self.start_urls[0],
                cookies=cookie_dict,
                dont_filter=True,
            )
        ]

    def parse_job(self, response):
        """Handle a job-detail response; on a 302, report it and pause.

        Args:
            response: The response downloaded for a ``jobs/<digits>.html``
                link extracted by ``rules``.
        """
        if response.status == 302:
            print("302")
            # NOTE(review): time.sleep blocks the calling thread for 100 s;
            # presumably a crude back-off -- consider a download delay
            # setting instead if the whole crawl should not stall.
            time.sleep(100)
当页面返回 302 状态码时,它从不打印 "302"。