我是python的初学者,我正在使用scrapy。我使用xmlitemexporter将我的已删除数据导出到xml文件。但我只在xml文件中得到“<”/ item“”>“。 我的items.py如下:
from scrapy.item import Item, Field
class WorkwithitemsItem(Item):
title = Field()
link = Field()
publish = Field()
description = Field()
蜘蛛就像:
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from workwithitems.items import WorkwithitemsItem
class MySpider(BaseSpider):
name = 'spidey'
allowed_domains = ['ekantipur.com']
start_urls = [
'http://www.ekantipur.com/en/rss',
]
def parse(self, response):
self.log('A response from %s just arrived!' % response.url)
sel = Selector(response)
title = sel.xpath('//title/text()').extract()
link = sel.xpath('//link/text()').extract()
publish = sel.xpath('//pubDate/text()').extract()
description = sel.xpath('//description/text()').extract()
WorkwithitemsItem(title = title[2:], link = link[2:],
publish = publish, description = description[1:])
pipelines.py是:
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter
class XmlExportPipeline(object):
def __init__(self):
self.files = {}
@classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_opened(self, spider):
file = open('%s_products.xml' % spider.name, 'w+b')
self.files[spider] = file
self.exporter = XmlItemExporter(file)
self.exporter.start_exporting()
def spider_closed(self, spider):
self.exporter.finish_exporting()
file = self.files.pop(spider)
file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
settings.py是:
BOT_NAME = 'workwithitems'
SPIDER_MODULES = ['workwithitems.spiders']
NEWSPIDER_MODULE = 'workwithitems.spiders'
FEED_EXPORTERS_BASE = {
'xml': 'scrapy.contrib.exporter.XmlItemExporter',
}
ITEM_PIPELINES = {
'workwithitems.pipelines.XmlExportPipeline': 800,
}
我无法弄清楚我的问题在哪里。
答案 0 :(得分:0)
确定!我发现了这个问题。我所做的只是在spider.py
的最后一行放一个'return'return WorkwithitemsItem(title = title[2:], link = link[2:],
publish = publish, description = description[1:]
)