我正在尝试使用Celery运行Scrapy抓取过程。我看了很多教程,这就是每个人似乎都在做的事情,但它对我不起作用:
tasks.py
from multiprocessing import Process
from scrapy.crawler import CrawlerProcess
from scrapy.conf import settings
from scraper.ubuntu_scraper.ubuntu_spider import UbuntuSpider
from celery.task.schedules import crontab
from celery.decorators import periodic_task
# this will run every minute
@periodic_task(run_every=crontab(hour="*", minute="*", day_of_week="*"))
def crawl():
    """Periodic Celery task: kick off one Scrapy crawl (runs every minute)."""
    return DomainCrawlerScript().crawl()
class DomainCrawlerScript():
    """Run a Scrapy crawl in a child process.

    A fresh process is forked per crawl because Twisted's reactor cannot
    be restarted within one process, which matters for a periodic task.
    """

    def __init__(self):
        # BUG FIX: modern CrawlerProcess has no install()/configure()
        # methods -- calling them raises the AttributeError shown in the
        # traceback. Constructing with the settings object is enough.
        self.crawler = CrawlerProcess(settings)

    def _crawl(self):
        # Runs inside the child process: schedule the spider, start the
        # reactor (blocks until the crawl finishes), then stop it.
        self.crawler.crawl(UbuntuSpider)
        self.crawler.start()
        self.crawler.stop()

    def crawl(self):
        # Public entry point: do the crawl in a subprocess and wait for it.
        p = Process(target=self._crawl)
        p.start()
        p.join()
celery.py
from __future__ import absolute_import
import os
from celery import Celery
# Point Celery/Django at the project settings BEFORE creating the app,
# so config_from_object below can resolve 'django.conf:settings'.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'hardware.settings')
# Celery application: uses the Django database as broker ('django://')
# and explicitly includes the task module so the worker registers crawl().
app = Celery('hardware', broker = 'django://', include=['scraper.tasks'])
# Load all CELERY_* options from Django's settings module.
app.config_from_object('django.conf:settings')
当我运行python manage.py celeryd -v 2 -B -s celery -E -l INFO -I scraper.tasks
时,我得到:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/celery/app/trace.py", line 238, in trace_task
R = retval = fun(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/celery/app/trace.py", line 416, in __protected_call__
return self.run(*args, **kwargs)
File "/home/olyazavr/Moka5/hardware/scraper/tasks.py", line 12, in crawl
crawler.install()
AttributeError: 'CrawlerProcess' object has no attribute 'install'
答案 0(得分:0):
了解scrapy crawl command是如何做到的并做同样的事情:
# Excerpt mirroring what the `scrapy crawl` command does internally
# (NOTE(review): fragment -- `self`, `spname` and `opts` come from the
# enclosing command class, which is not shown here).
crawler = self.crawler_process.create_crawler()
spider = crawler.spiders.create(spname, **opts.spargs)
crawler.crawl(spider)
self.crawler_process.start()