我正在尝试从scrapy中的url列表中为每个抓取的url生成一个csv文件。我明白我会修改pipeline.py,但到目前为止我所有的尝试都失败了。我不明白如何将被抓取的URL传递给管道并将其用作输出的名称并相应地拆分输出。
任何帮助?
谢谢！
这里是蜘蛛和管道
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
class VappPipeline(object):
    """Export every scraped item to one CSV file per spider.

    A CsvItemExporter is created when a spider opens and finalized when it
    closes.  Both the open files and the exporters are kept in per-spider
    dicts, so the pipeline still works when several spiders run in the same
    process (the original kept a single ``self.exporter`` attribute, which
    each newly opened spider silently overwrote while ``self.files`` was
    already per-spider).
    """

    def __init__(self):
        # Per-spider state, keyed by the spider instance.
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it up to the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # One CSV file per spider, named after the spider.
        # NOTE(review): assumes the 'results/' directory already exists.
        file = open('results/%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        exporter = CsvItemExporter(file)
        exporter.fields_to_export = ['item']
        exporter.start_exporting()
        self.exporters[spider] = exporter

    def spider_closed(self, spider):
        # Flush this spider's exporter before closing its backing file.
        self.exporters.pop(spider).finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporters[spider].export_item(item)
        return item
管道:
（管道代码已在上文给出。）
答案 0（得分：0）：
我认为当你的抓取完成而不是按项目时,你应该批量处理所有这些事情作为后处理步骤,但这里有关于你如何做你想做的草案:
from scrapy import Spider
from scrapy.selector import Selector
from vApp.items import fItem


class VappSpider(Spider):
    """Scrape the first <td> text of each table row on every start URL.

    Start URLs are read from ``data/listOfUrls.txt`` (one URL per line).
    Each yielded item carries the scraped cell text plus the page URL so a
    downstream pipeline can split the output per URL/domain.
    """

    name = "vApp"
    allowed_domains = ["google.co.uk"]
    # Read the URL list once at class-definition time; the with-statement
    # closes the file (the original left the handle open).
    with open('data/listOfUrls.txt') as _urls_file:
        start_urls = [line.strip() for line in _urls_file]
    del _urls_file

    def parse(self, response):
        # BUGFIX: '//[@id=...]' is invalid XPath — a node test is required
        # after '//', so use '//*[@id=...]' to match any element by id.
        # NOTE(review): the 'tbody' step assumes the raw HTML contains it;
        # browsers insert <tbody> automatically but raw pages often omit
        # it — verify against the actual page source.
        trs = Selector(response).xpath('//*[@id="incdiv"]/table/tbody/tr')
        for tr in trs:
            item = fItem()
            try:
                # First cell's text node, or 'null' when the row has none.
                item['item'] = tr.xpath('td/text()').extract()[0]
            except IndexError:
                item['item'] = 'null'
            item['url'] = response.url
            yield item
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
from urlparse import urlparse
class VappPipeline(object):
    """Route scraped items into one CSV file per domain.

    The domain is taken from each item's ``url`` field; an exporter (and
    its backing file) is created lazily the first time a domain is seen,
    and all of them are finalized when the spider closes.
    """

    def __init__(self):
        # Both dicts are keyed by domain (the netloc of the item's URL).
        self.files = {}
        self.exporter = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to spider_closed."""
        instance = cls()
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def _exporter_for(self, domain):
        # Lazily open 'results/<domain>.csv' and start a CSV exporter on it.
        if domain not in self.exporter:
            output = open('results/%s.csv' % domain, 'w+b')
            self.files[domain] = output
            csv_exporter = CsvItemExporter(output)
            csv_exporter.fields_to_export = ['item']
            csv_exporter.start_exporting()
            self.exporter[domain] = csv_exporter
        assert domain in self.exporter
        return self.exporter[domain]

    def process_item(self, item, spider):
        """Write the item to the CSV file matching its URL's domain."""
        domain = urlparse(item['url']).netloc
        self._exporter_for(domain).export_item(item)
        return item

    def spider_closed(self, spider):
        """Finish every exporter and close every file this pipeline opened."""
        for domain, exporter in self.exporter.iteritems():
            exporter.finish_exporting()
            self.files[domain].close()