我正在尝试使用Scrapy将抓取到的项目导出为CSV,并让每个字段都用双引号括起来。目前CSV可以正确导出,但是当我修改项目字段并手动添加双引号时,生成的CSV会把该字段括在三重双引号中。下面是我的代码示例:
import scrapy
from tutorial.items import StoreItem
class SecilSpider(scrapy.Spider):
    """Spider for secilstore.com that yields one ``StoreItem`` per product colour.

    The same ``parse`` callback handles every response: it follows each
    product link found on the page and, when the response was reached from
    another page (i.e. the request carries a ``Referer`` header), extracts
    the product details.
    """

    name = "secil"
    allowed_domains = ["secilstore.com"]

    def start_requests(self):
        """Return the initial requests: page 1 of each listing, in reversed order."""
        # ``range`` replaces the Python-2-only ``xrange``; for these tiny
        # ranges the behaviour is identical on both Python versions.
        start_urls = reversed(
            ["http://www.secilstore.com/yeni_liste/Sayfa/{0}".format(page) for page in range(1, 2)]
            + ["http://www.secilstore.com/yeni_liste/Magaza/Aksesuar_32/Sayfa/{0}".format(page) for page in range(1, 2)]
            + ["http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33/Sayfa/{0}".format(page) for page in range(1, 2)]
        )
        return [scrapy.Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        """Follow product links and yield a ``StoreItem`` for each colour variant.

        Yields:
            ``scrapy.Request`` objects for every product link on the page,
            followed by one ``StoreItem`` per colour when the response has a
            ``Referer`` header and lists at least one colour.

        Raises:
            IndexError: if the page lists colours but lacks the main image,
                price or brand element (same as the original ``[0]`` lookups).
        """
        for url in response.xpath('//div[@class="image"]/a/@href').extract():
            yield scrapy.Request("http://www.secilstore.com" + url, callback=self.parse)

        referer = response.request.headers.get('Referer', None)
        if referer is None:
            return
        # Scrapy returns header values as bytes on Python 3; decode so the
        # text separator below works. On Python 2 ``bytes is str`` and this
        # branch is never taken, so the original behaviour is unchanged.
        if isinstance(referer, bytes) and not isinstance(referer, str):
            referer = referer.decode('utf-8')
        # Keep only the listing part of the referring URL (drop the page suffix).
        base_url = referer.split('Sayfa')[0]

        colors = response.xpath('//a[@class="renk"]/text()').extract()
        if not colors:
            # No colour variants: nothing to export (and, as before, none of
            # the ``[0]`` lookups below are evaluated).
            return

        # These values are identical for every colour, so compute them once.
        image_url = "http://www.secilstore.com" + response.xpath('//img[@id="productMainImage"]/@src').extract()[0]
        price = response.xpath('//span[@class="price cufonHover"]/text()').extract()[0] + "TL"
        title = response.xpath('//h2[@class="cufon"]/text()').extract()
        brand = response.xpath('//h3[@class="slogan cufonSemi"]/text()').extract()[0]
        size = '|'.join(s.strip() for s in response.xpath('//a[@class="inStock"]/text()').extract())
        old_price = response.xpath('//div[@class="indirimFiyat"]/text()').extract()

        for c in colors:
            # A fresh item per colour: the original reused one instance, so
            # every yielded item was the same (later mutated) object.
            item = StoreItem()
            item['url'] = base_url
            item['productUrl'] = response.url
            item['imageUrl'] = image_url
            item['color'] = c
            item['price'] = price
            item['title'] = title
            item['brand'] = brand
            # -1 is the sentinel for "no size in stock" / "no old price".
            item['size'] = size if size else -1
            item['oldPrice'] = old_price[0] + "TL" if old_price else -1
            # The original also called ``items.append(item)`` here, but
            # ``items`` is never defined and would raise NameError.
            yield item
class CSVPipeline(object):
    """Item pipeline that writes every item of a spider to a CSV file.

    The output file is opened when the spider starts and closed when it
    finishes; open file handles are tracked per spider in ``self.files``.
    """

    def __init__(self):
        # Maps each running spider to its open output file.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it to the spider open/close signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the spider's CSV file and start a headerless exporter on it."""
        csv_file = open('/home/ali/%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = csv_file
        # Same arguments the original passed positionally, spelled out by name.
        # NOTE(review): per Scrapy's documented signature the third positional
        # argument is ``join_multivalued`` (the separator for list-valued
        # fields), not a quote character - confirm '"' is intended here.
        self.exporter = CsvItemExporter(
            csv_file,
            include_headers_line=False,
            join_multivalued='"',
        )
        # Column order of the generated CSV.
        self.exporter.fields_to_export = [
            'url',
            'productUrl',
            'title',
            'brand',
            'imageUrl',
            'price',
            'oldPrice',
            'color',
            'size',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the file that belongs to ``spider``."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export ``item`` and return it unchanged for later pipeline stages."""
        self.exporter.export_item(item)
        return item
所以,当我尝试在蜘蛛中修改某个字段并手动添加双引号时(例如,对于 item['url']):
# Attempted workaround: wrap the value in literal double quotes by hand.
# This is the assignment that leads to the triple-quoted first field in the
# CSV output shown in the question.
item['url'] = '"%s"' % baseUrl
生成的CSV打印出以下内容:
"""http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33""",http://www.secilstore.com/urun/5905b5c6b858458df3f4851d477eec1b/Secil-Kilit-Aksesuarli-Kisa-Sapli-Canta,Kilit Aksesuarlı Kısa Saplı Çanta,Seçil,http://www.secilstore.com/_docs/i400x500/a/a1894cadeb_Kilit-Aksesuarli-Kisa-Sapli-canta.jpg,"69,90TL","159,90TL",Ekru,-1
如您所见,第一个字段被三重双引号括起来,而不是只有一对双引号。另外有趣的是,价格字段在输出时也带着双引号。如何让每个字段都只被一对双引号括起来?
谢谢!
答案 0 :(得分:3)
我通过修改CSVItemPipeline解决了这个问题:
# Create the exporter without a header line and ask for every field to be
# quoted (csv.QUOTE_ALL) instead of adding quote characters to the values by
# hand. Requires ``import csv``; presumably the extra keyword is forwarded to
# the underlying csv writer - confirm against the Scrapy exporter docs.
self.exporter = CsvItemExporter(open(spider.name+".csv", "w"), False,
fields_to_export=self.fields_to_export, quoting=csv.QUOTE_ALL)
这样我就可以生成一个每个字段都用双引号括起来的CSV文件。