I currently have a Scrapy project with the following structure:
.
├── articlescraper
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── items.py
│   ├── items.pyc
│   ├── pipelines.py
│   ├── pipelines.pyc
│   ├── scheduler.py
│   ├── scheduler.pyc
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── nujijspider.py
│       └── nujijspider.pyc
└── scrapy.cfg
Now, in my scheduler.py, I call this function:
from Queue import Queue
import threading
import time
import sys
import imp

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class Scheduler(object):
    """Scheduler is the base class for the Scheduler.

    This class loops on the queue object and calls the needed crawlers from within.
    Reschedules articles to be crawled again.
    """

    def __init__(self):
        self.articleInformation = {}
        self.taskQueue = Queue()

    def append_work(self, work):
        if work['Url'] not in self.articleInformation:
            self.articleInformation[work['Id']] = work
        print self.articleInformation

    def schedule(self):
        article = self.taskQueue.get()
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        process.crawl("articlecommentspider", url="///")
        process.start()
But this causes the following error in Scrapy:
File "/usr/local/lib/python2.7/site-packages/scrapy/spiderloader.py", line 43, in load
raise KeyError("Spider not found: {}".format(spider_name))
KeyError: 'Spider not found: articlecommentspider'
The spider:
class ArticleCommentSpider(scrapy.Spider):
    """ArticleCommentSpider can look for all the comments on an article page.

    Those article pages are specific to www.nujij.nl and nu.nl related websites.
    """
    name = 'articlecommentspider'
    allowed_domains = ['nujij.nl']

    def __init__(self, *args, **kwargs):
        super(ArticleCommentSpider, self).__init__(*args, **kwargs)
        arg = args.get('url')
        if not arg:
            print arg
        self.start_urls = arg

    def parse(self, response):
        title = response.xpath("//h1" + matchClass('title') + "//text()").extract()[1]  # Title is defined oddly inside nujij.nl (<h1 class="title">)
        articleId = prog.search(response.url).group().split('.')[0]  # This regex matches things like 873238.lynkx in the url
        response.replace(body=response.body.replace('<br>', '\n'))  # Needed for comments which have a lot of <br> tags
        for item in response.xpath('//ol[@class="reacties"]//li' + matchClass('hidenum')):  # Every list item underneath the reactions
            commentId = item.xpath('@id').extract_first()  # Id of the list item (unique on every article)
            c = item.xpath('.//div[@class="reactie-body "]/text()').extract()
            c = ''.join(map(unicode.strip, c))
            date = item.xpath('normalize-space(.//span[@class="tijdsverschil"])').extract()
            date = dateparser.parse("".join(date))

            articleComment = Comment()
            articleComment['Id'] = articleId + "+" + str(commentId)
            articleComment['Source'] = str(title)
            articleComment['IndexedAt'] = date
            articleComment['Url'] = response.url
            articleComment['Parser'] = "nujij.nl"
            articleComment['Content'] = str(c)
            articleComment['Subject'] = {
                "url": response.url,
                "title": str(title)
            }
            print articleComment
When I list the spiders with "scrapy list", they all show up. The scheduler file also lives inside the articlescraper project. Why am I not able to call the spider from within this process?
Answer 0 (score: 0):
You need to provide a scrapy.Spider class, not the name of the spider.
from articlescraper.spiders.nujijspider import ArticleCommentSpider

process.crawl(ArticleCommentSpider)
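For completeness, here is a minimal sketch of how the schedule method and the spider could fit together, assuming the queued item carries the URL under the 'Url' key (as append_work above suggests). Note that in the spider from the question, args.get('url') would fail regardless, because args is a tuple; keyword arguments passed to process.crawl() arrive in kwargs:

# scheduler.py (sketch, assuming queued items have a 'Url' key)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from articlescraper.spiders.nujijspider import ArticleCommentSpider

def schedule(self):
    article = self.taskQueue.get()
    process = CrawlerProcess(get_project_settings())
    # Pass the spider class itself; keyword arguments are
    # forwarded to the spider's __init__.
    process.crawl(ArticleCommentSpider, url=article['Url'])
    process.start()  # blocks until the crawl is finished

# nujijspider.py (sketch) -- read the url from kwargs, not args
def __init__(self, *args, **kwargs):
    super(ArticleCommentSpider, self).__init__(*args, **kwargs)
    url = kwargs.get('url')
    if url:
        self.start_urls = [url]  # start_urls must be a list, not a string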
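Crawling by the string name, as in the original schedule(), can also work from a script, but only if get_project_settings() actually finds the project's settings.py; that is where SPIDER_MODULES lives, which the spider loader uses to resolve names, and it is only picked up when the script runs from a directory under scrapy.cfg. If the scheduler is started from somewhere else, one option (a sketch, assuming the settings module is articlescraper.settings) is to point Scrapy at the settings explicitly:

import os
# Must be set before get_project_settings() is called, otherwise
# SPIDER_MODULES is never loaded and name lookup fails with
# "Spider not found".
os.environ['SCRAPY_SETTINGS_MODULE'] = 'articlescraper.settings'

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('articlecommentspider', url='///')  # name lookup now succeeds
process.start()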