I currently have a Scrapy project with the following structure:
.
├── articlescraper
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── items.py
│   ├── items.pyc
│   ├── pipelines.py
│   ├── pipelines.pyc
│   ├── scheduler.py
│   ├── scheduler.pyc
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── nujijspider.py
│       └── nujijspider.pyc
└── scrapy.cfg
Now, in my scheduler.py, I call this function:
from Queue import Queue
import threading
import time
import sys
import imp

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class Scheduler(object):
    """Scheduler is the base class for the Scheduler.

    This class loops on the queue object and calls the needed crawlers from within.
    Reschedules articles to be crawled again.
    """

    def __init__(self):
        self.articleInformation = {}
        self.taskQueue = Queue()

    def append_work(self, work):
        if work['Url'] not in self.articleInformation:
            self.articleInformation[work['Id']] = work
        print self.articleInformation

    def schedule(self):
        article = self.taskQueue.get()
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        process.crawl("articlecommentspider", url="///")
        process.start()
But this causes the following error in Scrapy:
File "/usr/local/lib/python2.7/site-packages/scrapy/spiderloader.py", line 43, in load
raise KeyError("Spider not found: {}".format(spider_name))
KeyError: 'Spider not found: articlecommentspider'
The spider:
class ArticleCommentSpider(scrapy.Spider):
    """ArticleCommentSpider can look for all the comments on an article page.

    Those article pages are specific to www.nujij.nl and nu.nl related websites.
    """
    name = 'articlecommentspider'
    allowed_domains = ['nujij.nl']

    def __init__(self, *args, **kwargs):
        super(ArticleCommentSpider, self).__init__(*args, **kwargs)
        arg = args.get('url')
        if not arg:
            print arg
        self.start_urls = arg

    def parse(self, response):
        title = response.xpath("//h1" + matchClass('title') + "//text()").extract()[1]  # Title is defined oddly inside nujij.nl (<h1 class="title">)
        articleId = prog.search(response.url).group().split('.')[0]  # This regex matches things like 873238.lynkx in the url
        response.replace(body=response.body.replace('<br>', '\n'))  # Needed for comments which have a lot of <br> tags
        for item in response.xpath('//ol[@class="reacties"]//li' + matchClass('hidenum')):  # Every list item underneath the reactions
            commentId = item.xpath('@id').extract_first()  # Id of the list item (unique on every article)
            c = item.xpath('.//div[@class="reactie-body "]/text()').extract()
            c = ''.join(map(unicode.strip, c))
            date = item.xpath('normalize-space(.//span[@class="tijdsverschil"])').extract()
            date = dateparser.parse("".join(date))

            articleComment = Comment()
            articleComment['Id'] = articleId + "+" + str(commentId)
            articleComment['Source'] = str(title)
            articleComment['IndexedAt'] = date
            articleComment['Url'] = response.url
            articleComment['Parser'] = "nujij.nl"
            articleComment['Content'] = str(c)
            articleComment['Subject'] = {
                "url": response.url,
                "title": str(title)
            }
            print articleComment
When I list the spiders with "scrapy list", they all show up. The scheduler file also lives inside the articlescraper project. Why am I not able to call the spider from within this process?
Answer 0 (score: 0):
You need to provide a scrapy.Spider class, not the name of the spider.
from articlescraper.spiders.nujijspider import ArticleCommentSpider

process.crawl(ArticleCommentSpider)
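For completeness, here is a minimal sketch of how the schedule method and the spider could fit together, assuming the queued item carries the URL under the 'Url' key (as append_work above suggests). Note that in the spider from the question, args.get('url') would fail regardless, because args is a tuple; keyword arguments passed to process.crawl() arrive in kwargs:

# scheduler.py (sketch, assuming queued items have a 'Url' key)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from articlescraper.spiders.nujijspider import ArticleCommentSpider

def schedule(self):
    article = self.taskQueue.get()
    process = CrawlerProcess(get_project_settings())
    # Pass the spider class itself; keyword arguments are
    # forwarded to the spider's __init__.
    process.crawl(ArticleCommentSpider, url=article['Url'])
    process.start()  # blocks until the crawl is finished

# nujijspider.py (sketch) -- read the url from kwargs, not args
def __init__(self, *args, **kwargs):
    super(ArticleCommentSpider, self).__init__(*args, **kwargs)
    url = kwargs.get('url')
    if url:
        self.start_urls = [url]  # start_urls must be a list, not a string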
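Crawling by the string name, as in the original schedule(), can also work from a script, but only if get_project_settings() actually finds the project's settings.py; that is where SPIDER_MODULES lives, which the spider loader uses to resolve names, and it is only picked up when the script runs from a directory under scrapy.cfg. If the scheduler is started from somewhere else, one option (a sketch, assuming the settings module is articlescraper.settings) is to point Scrapy at the settings explicitly:

import os
# Must be set before get_project_settings() is called, otherwise
# SPIDER_MODULES is never loaded and name lookup fails with
# "Spider not found".
os.environ['SCRAPY_SETTINGS_MODULE'] = 'articlescraper.settings'

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('articlecommentspider', url='///')  # name lookup now succeeds
process.start()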