I am trying to automatically restart my spider when scraping finishes, in particular when the response status is bad. For example, I have this code:
#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from urlparse import urljoin
from bs4 import BeautifulSoup
from scrapy.spider import BaseSpider
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from datetime import datetime
import re
class level1(BaseSpider):
    # Crawling start
    CrawlSpider.started_on = datetime.now()
    name = "level1"
    base_domain = 'http://www.google.com'
    DOWNLOAD_DELAY = 3
    restart = False
    handle_httpstatus_list = [404, 302, 503, 999, 200]  # add any other code you need
    # Call sendEmail class
    email = sendEmail()
    # Call log settings
    saveLog = runlog()

    # Init
    def __init__(self, url='', child='', parent=''):
        self.start_urls = [url]
        self.child = child
        self.parent = parent
        # run baby, run :)
        super(level1, self).__init__(self.start_urls)
        # On spider closed
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, reason):
        if self.restart:
            print "we need to retry"
            super(level1, self).__init__(self.start_urls)
        else:
            print "ok"
        # parsing time
        work_time = datetime.now() - CrawlSpider.started_on
        # Correct finish
        if reason == "finished":
            print "finished"

    def parse(self, response):
        if response.status == 503:
            self.restart = True
        if response.status == 999:
            self.restart = True
        if str(response.status) == "200":
            # Selector
            sel = Selector(response)
            todo
In the spider_closed method I try to restart my spider when the response status is bad, but it does not work.
How can I solve this?
Answer 0 (score: 0)
I am not sure that calling __init__ will restart your spider.
In the worst case, you could write a separate program that spawns the crawler using this Core API (from the link) and restarts it as needed, although I agree that restarting from within the spider script would be simpler.
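For reference, here is a minimal sketch (not from the original answer) of that separate-program idea, using CrawlerRunner from Scrapy's Core API in a reasonably recent Scrapy version. It re-runs the level1 spider from the question until a run finishes without flagging a retry; the retry limit and the idea of reading the spider's restart attribute off the finished crawler are assumptions about how you keep that state, so adapt them to your own code.

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl_with_retries(max_retries=3):
    for attempt in range(max_retries):
        # A Crawler can only run one crawl, so build a fresh one per attempt.
        crawler = runner.create_crawler(level1)
        yield runner.crawl(crawler, url='http://www.google.com')
        # The spider sets self.restart = True on a 503/999 response (see the
        # question's parse method); a clean run means we can stop retrying.
        if not getattr(crawler.spider, 'restart', False):
            break
    reactor.stop()

crawl_with_retries()
reactor.run()  # blocks until reactor.stop() is called above

The key point is that the retry loop lives outside the spider: by the time spider_closed fires it is too late to restart anything by calling __init__ again, whereas a driver script like this decides after each crawl whether to launch another one.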