I'm trying to get Scrapy to send me an email when a crawler finishes or breaks. There's already a built-in extension for sending stats, but I'd like to attach the spider's errors as <spidername>-errors.log and the scraped items as <spidername>-items.json.

I've hooked a callback up to each signal, but for some reason only the last one fires:
from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO


class StatusMailer(object):
    def __init__(self, recipients, mail, crawler):
        self.recipients = recipients
        self.mail = mail
        self.files = defaultdict(StringIO)
        self.encoder = ScrapyJSONEncoder(crawler=crawler)

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist("STATUSMAILER_RCPTS")

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)

        return instance

    def item_scraped(self, item, response, spider):
        self.files[spider.name + '.json'].write(self.encoder.encode(item) + '\n')

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '-errors.log'].write(failure.getTraceback() + '\n')

    def spider_closed(self, spider):
        return self.mail.send(
            to=self.recipients,
            subject="Crawler for %s finished" % spider.name,
            body="",
            attachs=[(name, 'text/plain', contents)
                     for name, contents in self.files.items()]
        )
Is there any way to access the exported items and the spider's errors from within Scrapy (perhaps by intercepting them with some kind of hook before they're printed to the console)?
Answer (score: 9)
Well, it turns out this was much simpler than I thought. You have to "rewind" the StringIO instances after you've finished writing to them:
def spider_closed(self, spider):
    files = []

    for name, contents in self.files.items():
        contents.seek(0)  # rewind so read() starts from the beginning
        files.append((name, 'text/plain', contents))

    return self.mail.send(
        to=self.recipients,
        subject="Crawler for %s finished" % spider.name,
        body="",
        attachs=files
    )
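To see why the rewind is needed, here's a minimal standalone sketch (plain Python 2, nothing Scrapy-specific): writing advances the file position to the end of the buffer, so a read() straight afterwards returns an empty string until you seek() back to the start.

from StringIO import StringIO

buf = StringIO()
buf.write('hello')

print repr(buf.read())  # '' -- the position is at the end after writing
buf.seek(0)             # rewind to the beginning
print repr(buf.read())  # 'hello'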
For anyone interested, here's my full email extension:
import gzip
import datetime

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

# Use the pure-Python StringIO here: cStringIO's objects can't be
# subclassed, and PlainCompressor below needs to inherit from it.
from StringIO import StringIO


def format_size(size):
    """Render a byte count as a human-readable string, e.g. 2048 -> '2.0 KB'."""
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if size < 1024.0:
            return "%3.1f %s" % (size, x)
        size /= 1024.0


class GzipCompressor(gzip.GzipFile):
    extension = '.gz'
    mimetype = 'application/gzip'

    def __init__(self):
        super(GzipCompressor, self).__init__(fileobj=PlainCompressor(), mode='w')
        # Read the compressed bytes back straight from the underlying buffer
        self.read = self.fileobj.read


class PlainCompressor(StringIO):
    extension = ''
    mimetype = 'text/plain'

    def read(self, *args, **kwargs):
        self.seek(0)  # rewind before reading -- see the fix above
        return StringIO.read(self, *args, **kwargs)

    @property
    def size(self):
        return len(self.getvalue())


class StatusMailer(object):
    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder(crawler=crawler)
        self.files = defaultdict(compressor)

        self.num_items = 0
        self.num_errors = 0

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(instance.request_received, signal=signals.request_received)

        return instance

    def item_scraped(self, item, response, spider):
        self.files[spider.name + '-items.json'].write(self.encoder.encode(item))
        self.num_items += 1

    def spider_error(self, failure, response, spider):
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def request_received(self, request, spider):
        self.files[spider.name + '.log'].write(str(request) + '\n')

    def spider_closed(self, spider, reason):
        files = []

        for name, compressed in self.files.items():
            files.append((name + compressed.extension, compressed.mimetype, compressed))

        try:
            size = self.files[spider.name + '-items.json'].size
        except KeyError:
            size = 0

        body = '''Crawl statistics:

- Spider name: {0}
- Spider finished at: {1}
- Number of items scraped: {2}
- Number of errors: {3}
- Size of scraped items: {4}'''.format(
            spider.name,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )

        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )
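A side note on the design: defaultdict needs a zero-argument factory, which is why both compressor classes take no constructor arguments and expose the same read()/extension/mimetype interface. A minimal sketch of that behaviour (hypothetical demo code, assuming the classes above are in scope):

from collections import defaultdict

files = defaultdict(PlainCompressor)           # or defaultdict(GzipCompressor)
files['demo-items.json'].write('{"id": 1}\n')  # first access creates the buffer
print files['demo-items.json'].read()          # read() rewinds first, so this prints the JSON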
Add it to your settings.py:
EXTENSIONS = {
    'your_package.extensions.StatusMailer': 80
}
And configure it:
STATUSMAILER_RECIPIENTS = []         # fill in, e.g. ['you@example.com'] -- empty disables the extension
STATUSMAILER_COMPRESSION = 'gzip'
#STATUSMAILER_COMPRESSION = None     # uncomment for plain-text attachments

MAIL_HOST = 'smtp.gmail.com'
MAIL_PORT = 587
MAIL_USER = ''
MAIL_PASS = ''
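Since mistyped SMTP credentials are the most common failure point here, it can help to verify them outside Scrapy first, using only the standard library (a standalone sketch; the addresses and password are placeholders):

import smtplib

server = smtplib.SMTP('smtp.gmail.com', 587)
server.starttls()  # Gmail requires TLS on port 587
server.login('you@gmail.com', 'your-password')
server.sendmail('you@gmail.com', ['you@example.com'],
                'Subject: SMTP test\n\nIt works')
server.quit()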