I'm trying to make Scrapy exit with status 1. The script is run from an Airflow DAG, but the task never exits with status code 1:
try:
    photo = requests.get(self.img_url + '/' + inmate['ImageId']).content
except Exception as e:
    logging.error("Error is " + str(e))
    self.spider.closespider_errorcount(1)
    raise CloseSpider(e)
    sys.exit(1)
[2019-08-21 09:34:16,199] {{bash_operator.py:127}} INFO - scrapy.exceptions.CloseSpider
[2019-08-21 09:34:18,737] {{bash_operator.py:131}} INFO - Command exited with return code 0
[2019-08-21 09:34:18,852] {{base_task_runner.py:101}} INFO - Job 26813: Subtask crawl_7001 /usr/local/lib/python3.6/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use "pip install psycopg2-binary" instead. For details see: http://initd.org/psycopg/docs/install.html#binary-install-from-pypi.
[2019-08-21 09:34:18,853] {{base_task_runner.py:101}} INFO - Job 26813: Subtask crawl_7001 """)
[2019-08-21 09:34:21,189] {{logging_mixin.py:95}} INFO - [2019-08-21 09:34:21,189] {{jobs.py:2630}} WARNING - State of this instance has been externally set to success. Taking the poison pill.
[2019-08-21 09:34:21,351] {{helpers.py:281}} INFO - Sending Signals.SIGTERM to GPID 4765
[2019-08-21 09:34:21,436] {{helpers.py:263}} INFO - Process psutil.Process(pid=4765, status='terminated') (4765) terminated with exit code -15
[2019-08-21 09:34:21,437] {{logging_mixin.py:95}} INFO - [2019-08-21 09:34:21,437] {{jobs.py:2562}} INFO - Task exited with return code 0
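What the log shows is Scrapy's engine catching CloseSpider and shutting the crawl down gracefully, so the scrapy crawl command still returns 0, and the sys.exit(1) after the raise is never reached. As a rough sketch of one workaround (the spider name 'quotes' and the standalone-script setup are assumptions, not from the question), the crawl can be started from a plain Python script that maps Scrapy's finish_reason stat to a real process exit code:

import sys
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
crawler = process.create_crawler('quotes')  # assumed spider name
process.crawl(crawler)
process.start()  # blocks until the crawl finishes

# CoreStats records finish_reason='finished' for a clean shutdown; the
# reason passed to CloseSpider appears here instead on an early close.
if crawler.stats.get_value('finish_reason') != 'finished':
    sys.exit(1)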
Answer 0 (score: 0)
I ran into the same problem; here is my solution.
The idea is to use the signals.spider_closed signal. First, give every request an errback that raises CloseSpider on failure:
import scrapy
from scrapy.exceptions import CloseSpider

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse, errback=self.parse_error)

    def parse(self, response):
        pass

    def parse_error(self, failure):
        # Errbacks receive a twisted Failure, not a response. Raising
        # CloseSpider here closes the spider with reason 'My reason'.
        raise CloseSpider('My reason')
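CloseSpider on its own only ends the crawl gracefully; the scrapy process still returns 0. To turn an abnormal close into a non-zero exit status, drive the crawl from a wrapper that listens for the spider_error and spider_closed signals: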
import logging
import sys
from logging.config import dictConfig

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class ScrapyCrawlerProcessWrapper:
    crawler_error = None
    crawler_closed = None

    def __init__(self, log_level: str = 'INFO'):
        self.crawler_error = None
        # install_root_handler=False so the dictConfig below stays in charge
        self.process = CrawlerProcess(get_project_settings(), install_root_handler=False)
        dictConfig({
            'version': 1,
            'disable_existing_loggers': True,
            'loggers': {
                'scrapy': {
                    'level': log_level,
                }
            }
        })

    def add_spider(self, spider: scrapy.Spider, **kwargs):
        logging.debug('add_spider: %s', spider)
        self.process.crawl(spider, **kwargs)

    def start_crawl(self):
        for crawler in self.process.crawlers:
            crawler.signals.connect(receiver=self.spider_error, signal=signals.spider_error)
            crawler.signals.connect(receiver=self.spider_closed, signal=signals.spider_closed)

        # Blocks until all crawlers have finished
        self.process.start()

        # Re-raise any exception a spider callback produced
        if self.crawler_error:
            self.crawler_error.raiseException()

        # A close reason other than 'finished' means the crawl was aborted
        if self.crawler_closed:
            logging.error('The crawler was closed because: %s', self.crawler_closed)
            sys.exit(1)

    def spider_error(self, *args, **kwargs):
        if 'failure' in kwargs:
            print(kwargs['failure'])
            self.crawler_error = kwargs['failure']

    def spider_closed(self, spider, reason):
        if reason != 'finished':
            self.crawler_closed = reason
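A minimal usage sketch, assuming the QuotesSpider above and an entry-point script of your choosing (neither is spelled out in the original answer): run this script from the DAG's bash task instead of scrapy crawl, and a spider closed for any reason other than 'finished' becomes exit status 1.

if __name__ == '__main__':
    wrapper = ScrapyCrawlerProcessWrapper(log_level='INFO')
    wrapper.add_spider(QuotesSpider)  # any scrapy.Spider subclass works
    wrapper.start_crawl()             # calls sys.exit(1) on abnormal close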