Scrapy - ItemPipeline 不进入 process_item 方法

时间:2017-06-27 20:02:00

标签: python django scrapy

我正在玩Scrapy，并尝试将Spider生成的项目（item）传递给ItemPipeline。问题是，当item进入管道时，实际的process_items方法从不被调用。尽管我已经调试了蜘蛛，并且看到它正确地产出了quote项目。总而言之，当我调试quotes_spider.py时，我可以看到我返回的'item'对象是Quote类型，其中author/quote具有期望值。类似地，管道被正确加载并创建了json文件，只是从未进入process_items方法，也没有向该文件写入任何内容。有什么建议吗？

quotes_spider.py

import scrapy
from scrapy.loader import ItemLoader
from tutorial.item_loaders import QuoteLoader
from tutorial.items import Quote


class QuotesSpider(scrapy.Spider):
    """Scrape quotations from the first two pages of quotes.toscrape.com.

    Each quote <div> is loaded into a ``Quote`` item through
    ``QuoteLoader`` and yielded to the configured item pipeline.
    """

    name = "quotes"

    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        """Yield one loaded Quote item per quote block on the page."""
        for quote_div in response.xpath('//div[contains(@class, "quote")]'):
            # NOTE(review): the loader is bound to the whole response, not
            # to quote_div; values are therefore extracted manually from
            # each quote node and fed in via add_value().
            loader = QuoteLoader(item=Quote(), response=response)

            text = quote_div.xpath('./span[contains(@itemprop, "text")]/text()').extract_first()
            loader.add_value('quote', text)

            by = quote_div.xpath('./span/small[contains(@itemprop, "author")]/text()').extract_first()
            loader.add_value('author', by)

            yield loader.load_item()

Items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TutorialItem(scrapy.Item):
    """Placeholder item generated by `scrapy startproject`; unused here."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class Quote(scrapy.Item):
    """Container for one scraped quotation: its text and its author."""
    quote = scrapy.Field()   # the quotation text
    author = scrapy.Field()  # the quotation's author name

item_loaders.py

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join


class QuoteLoader(ItemLoader):
    """ItemLoader whose fields each collapse to the first collected value."""
    # add_value() accumulates lists; TakeFirst outputs only the first element.
    default_output_processor = TakeFirst()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class QuotePipeline(object):
    """Write each scraped item to 'itemss.json', one JSON object per line.

    Registered via the ITEM_PIPELINES setting. Scrapy calls
    open_spider/close_spider once around the crawl and process_item
    once per yielded item.
    """

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        self.file = open('itemss.json', 'w')

    def close_spider(self, spider):
        # Called once when the spider finishes: release the file handle.
        self.file.close()

    def process_item(self, item, spider):
        # BUG FIX: the hook Scrapy invokes is `process_item` (singular);
        # the original `process_items` was simply never called, which is
        # why nothing was ever written to the file.
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        # A pipeline must return the item (or raise DropItem) so that
        # later pipelines keep receiving it; the original returned a
        # debug string instead. (Py2-style `print "HELLO"` removed --
        # it is a syntax error under Python 3.)
        return item

在settings.py中我已正确定义:

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# The integer (300) is the pipeline's run order; lower values run earlier.
ITEM_PIPELINES = {
    'tutorial.pipelines.QuotePipeline': 300,
}

1 个答案:

答案 0（得分：1）：

// NOTE(review): this answer block is Swift/PromiseKit-style code and
// appears unrelated to the Scrapy question above -- likely a scraping
// or pasting error in the page; verify against the original post.
//
// Resolve a token for `file`, recursing into its subfiles when present.
func retrieveToken(for file: File) -> Promise<Any> {
    return Promise<Any> { fulfill, reject in
        service.determineToken(for: file) { token, error in
            // if any error, reject

            guard let token = token, error == nil else {
                reject(error ?? FileError.someError)
                return
            }

            // if I don't have to make recursive call, `fulfill` immediately.
            // in my example, I'm going to see if there are subfiles, and if not, `fulfill` immediately.

            guard let subfiles = file.subfiles else {
                fulfill(token)
                return
            }

            // if I got here, there are subfiles and I'm going to start recursive set of promises

            self.retrieveTokens(for: subfiles).then { tokens in
                fulfill(tokens)
            }.catch { error in
                reject(error)
            }
        }
    }
}