Is there a way to trigger a method in a Spider class just before it terminates?
I can terminate the spider myself, like this:
from scrapy.exceptions import CloseSpider


class MySpider(CrawlSpider):
    # Config stuff goes here...

    def quit(self):
        # Do some stuff...
        raise CloseSpider('MySpider is quitting now.')

    def my_parser(self, response):
        if termination_condition:
            self.quit()

        # Parsing stuff goes here...
But I can't find any information on how to tell when a spider is about to quit on its own.
Answer 0 (score: 63)
It looks like you can register a signal listener through dispatcher.
I would try something like:
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher


class MySpider(CrawlSpider):

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # second param is the instance of the spider about to be closed.
        pass
Answer 1 (score: 29)
Just to update, you can simply define a closed function like this:
class MySpider(CrawlSpider):

    def closed(self, reason):
        do_something()  # cleanup work goes here
Answer 2 (score: 12)
For Scrapy version 1.0.0+ (it may also work with older versions):
from scrapy import signals


class MySpider(CrawlSpider):
    name = 'myspider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        print('Opening {} spider'.format(spider.name))

    def spider_closed(self, spider):
        print('Closing {} spider'.format(spider.name))
One good use of this is to add a tqdm progress bar to a scrapy spider.
# -*- coding: utf-8 -*-
from scrapy import signals
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tqdm import tqdm


class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['somedomain.comm']
    start_urls = ['http://www.somedomain.comm/ccid.php']

    rules = (
        Rule(LinkExtractor(allow=r'^http://www.somedomain.comm/ccds.php\?id=.*'),
             callback='parse_item',
             ),
        Rule(LinkExtractor(allow=r'^http://www.somedomain.comm/ccid.php$',
                           restrict_xpaths='//table/tr[contains(., "SMTH")]'),
             follow=True),
    )

    def parse_item(self, response):
        self.pbar.update()  # update progress bar by 1
        item = MyItem()  # MyItem is the project's Item subclass, defined elsewhere
        # parse response
        return item

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(MySpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        self.pbar = tqdm()  # initialize progress bar
        self.pbar.clear()
        self.pbar.write('Opening {} spider'.format(spider.name))

    def spider_closed(self, spider):
        self.pbar.clear()
        self.pbar.write('Closing {} spider'.format(spider.name))
        self.pbar.close()  # close progress bar
Answer 3 (score: 7)
For me the accepted answer did not work / is outdated, at least for scrapy 0.19. I got it working with the following, though:
from scrapy import signals
from scrapy.signalmanager import SignalManager
from scrapy.xlib.pydispatch import dispatcher


class MySpider(CrawlSpider):

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        SignalManager(dispatcher.Any).connect(
            self.closed_handler, signal=signals.spider_closed)

    def closed_handler(self, spider):
        # do stuff here
        pass
Answer 4 (score: 1)
For the latest version (v1.7), just define a closed(reason) method in your spider class.
closed(reason): Called when the spider closes. This method provides a shortcut to signals.connect() for the spider_closed signal.
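A minimal sketch of that, with a placeholder spider name and log message:

from scrapy.spiders import CrawlSpider


class MySpider(CrawlSpider):
    name = 'myspider'  # placeholder spider name

    def closed(self, reason):
        # reason is a string such as 'finished', 'cancelled' or 'shutdown'
        print('Spider closed: {}'.format(reason))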
Answer 5 (score: 0)
If you have many spiders and want to do something before each of them closes, it may be convenient to add a stats collector to your project.
In settings:
STATS_CLASS = 'scraper.stats.MyStatsCollector'
and the collector:
from scrapy.statscollectors import StatsCollector


class MyStatsCollector(StatsCollector):

    def _persist_stats(self, stats, spider):
        # do something here
        pass
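As a rough sketch of what that override might do, the following prints a couple of counters for each spider as it finishes (the keys 'item_scraped_count' and 'finish_reason' are standard Scrapy stats; the log format itself is just an example):

from scrapy.statscollectors import StatsCollector


class MyStatsCollector(StatsCollector):

    def _persist_stats(self, stats, spider):
        # stats is a plain dict holding this spider run's statistics
        print('{} finished: {} items scraped, reason={}'.format(
            spider.name,
            stats.get('item_scraped_count', 0),
            stats.get('finish_reason')))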