Here is the spider:

import scrapy
from danmurphys.items import DanmurphysItem


class MySpider(scrapy.Spider):
    name = 'danmurphys'
    allowed_domains = ['danmurphys.com.au']
    start_urls = ['https://www.danmurphys.com.au/dm/navigation/navigation_results_gallery.jsp?params=fh_location%3D%2F%2Fcatalog01%2Fen_AU%2Fcategories%3C%7Bcatalog01_2534374302084767_2534374302027742%7D%26fh_view_size%3D120%26fh_sort%3D-sales_value_30_days%26fh_modification%3D&resetnav=false&storeExclusivePage=false']

    def parse(self, response):
        urls = response.xpath('//h2/a/@href').extract()
        for url in urls:
            request = scrapy.Request(url, callback=self.parse_page)
            yield request

    def parse_page(self, response):
        item = DanmurphysItem()
        item['brand'] = response.xpath('//span[@itemprop="brand"]/text()').extract_first().strip()
        item['name'] = response.xpath('//span[@itemprop="name"]/text()').extract_first().strip()
        item['url'] = response.url
        return item
Here is the item:

import scrapy


class DanmurphysItem(scrapy.Item):
    brand = scrapy.Field()
    name = scrapy.Field()
    url = scrapy.Field()
When I run the spider with this command:

scrapy crawl danmurphys -o output.csv

the resulting output.csv has a blank line between each data row.
Answer 0 (score: 9)
To fix this in Scrapy 1.3 you can patch it by adding newline='' as a parameter to io.TextIOWrapper in the __init__ method of the CsvItemExporter class in scrapy.exporters.
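For illustration, the relevant lines inside CsvItemExporter.__init__ would then look roughly like this (a sketch based on the same TextIOWrapper call shown in a later answer; only the newline='' argument is the actual change):

# inside scrapy/exporters.py, CsvItemExporter.__init__ (Scrapy 1.3) -- sketch, not the full method
self.stream = io.TextIOWrapper(
    file,
    line_buffering=False,
    write_through=True,
    encoding=self.encoding,
    newline='',   # added argument: csv.writer handles line endings itself, so no extra "\r" on Windows
) if six.PY3 else file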
Answer 1 (score: 1)
This output shows the typical symptom of a csv file handle opened on Windows in "w" mode (probably to fix Python 3 compatibility) but omitting newline.
While this has no effect on Linux/Unix based systems, on Windows two carriage return characters are emitted, inserting a fake blank line after each data row.
with open("output.csv", "w") as f:
    cr = csv.writer(f)
The correct way (Python 3):
with open("output.csv", "w", newline='') as f:  # python 3
    cr = csv.writer(f)
(In Python 2, setting "wb" as the open mode fixes it.)
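For completeness, the Python 2 counterpart of the fix would look roughly like this (same placeholder file name as above):

import csv

# Python 2: open the file in binary mode so the csv module controls the line endings itself
with open("output.csv", "wb") as f:
    cr = csv.writer(f)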
If the file is produced by a program that you cannot or do not want to modify, you can always post-process the file as follows:
with open("output.csv", "rb") as f:
    with open("output_fix.csv", "w") as f2:
        f2.write(f.read().decode().replace("\r", ""))  # python 3
        # f2.write(f.read().replace("\r", ""))         # python 2: use this line instead of the one above
Answer 2 (score: 1)
I managed to solve this with the steps below. My project structure is:
C:.
|   scrapy.cfg
|
\---my_scraper
    |   exporters.py
    |   items.py
    |   middlewares.py
    |   pipelines.py
    |   settings.py
    |   __init__.py
    |
    +---spiders
    |       my_spider.py
    |       __init__.py
    |
exporters.py:

# -*- coding: utf-8 -*-
import csv
import io
import os

import six
from scrapy.conf import settings
from scrapy.exporters import CsvItemExporter
from scrapy.extensions.feedexport import IFeedStorage
from w3lib.url import file_uri_to_path
from zope.interface import implementer


@implementer(IFeedStorage)
class FixedFileFeedStorage(object):
    """File feed storage that opens the output file in binary append mode."""

    def __init__(self, uri):
        self.path = file_uri_to_path(uri)

    def open(self, spider):
        dirname = os.path.dirname(self.path)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        return open(self.path, 'ab')

    def store(self, file):
        file.close()


class MyCsvItemExporter(CsvItemExporter):

    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        # Custom delimiter, read from the project settings (default ';')
        delimiter = settings.get('CSV_DELIMITER', ';')
        kwargs['delimiter'] = delimiter

        super(MyCsvItemExporter, self).__init__(file, include_headers_line, join_multivalued, **kwargs)
        self._configure(kwargs, dont_fail=True)

        # Re-open the output file and wrap it with newline='' so csv.writer
        # controls the line endings itself (no extra "\r" on Windows).
        self.stream.close()
        storage = FixedFileFeedStorage(file.name)
        file = storage.open(file.name)
        self.stream = io.TextIOWrapper(
            file,
            line_buffering=False,
            write_through=True,
            encoding=self.encoding,
            newline="",
        ) if six.PY3 else file
        self.csv_writer = csv.writer(self.stream, **kwargs)
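The answer does not show how these classes get picked up, so presumably they still have to be registered in settings.py. A hedged sketch of what that registration might look like (the module path my_scraper.exporters follows the project tree above; the exact keys are my assumption, not part of the original answer):

# settings.py -- assumed wiring for the custom exporter / storage
FEED_EXPORTERS = {
    'csv': 'my_scraper.exporters.MyCsvItemExporter',
}
FEED_STORAGES = {
    '': 'my_scraper.exporters.FixedFileFeedStorage',
    'file': 'my_scraper.exporters.FixedFileFeedStorage',
}
CSV_DELIMITER = ';'  # read by MyCsvItemExporter above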
I hope this helps.
Answer 3 (score: 0)
Special thanks to everyone (especially Jean-François).
The problem was that I had another Scrapy version, 1.1.0, installed from conda for Python 3.5. Once I added Python 2.7 back to the system path, the original Scrapy 1.1.2 was picked up again and everything worked fine.
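If you suspect the same kind of environment mix-up, a quick check of which interpreter and Scrapy installation are actually being used can help (a small diagnostic sketch, nothing assumed beyond an installed Scrapy package):

import sys
import scrapy

# Show which Python interpreter is running and which Scrapy it imports
print(sys.executable)
print(sys.version)
print(scrapy.__version__)
print(scrapy.__file__)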
Answer 4 (score: 0)
I solved it via the pipelines.py file:
Probably not ideal, but I found a workaround for this problem. In the pipelines.py file I added some code that essentially reads the csv file (including the blank lines) into a list, drops the empty rows, and then writes the cleaned list to a new file.
The code I added is:
with open('%s_items.csv' % spider.name, 'r') as f:
    reader = csv.reader(f)
    original_list = list(reader)
    cleaned_list = list(filter(None, original_list))

with open('%s_items_cleaned.csv' % spider.name, 'w', newline='') as output_file:
    wr = csv.writer(output_file, dialect='excel')
    for data in cleaned_list:
        wr.writerow(data)
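Since the snippet refers to spider.name, it presumably runs inside a pipeline method after the feed export has finished. A minimal sketch of how it might sit in a close_spider method (the class name, and the assumption that close_spider is the right hook, are mine and not stated in the original answer):

import csv

class CleanBlankLinesPipeline(object):
    """Post-process the exported csv once the spider closes, removing blank rows."""

    def close_spider(self, spider):
        with open('%s_items.csv' % spider.name, 'r') as f:
            reader = csv.reader(f)
            cleaned_list = list(filter(None, list(reader)))

        with open('%s_items_cleaned.csv' % spider.name, 'w', newline='') as output_file:
            wr = csv.writer(output_file, dialect='excel')
            for data in cleaned_list:
                wr.writerow(data)

The class would still have to be enabled in ITEM_PIPELINES like any other pipeline.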
Full details of the whole pipelines.py file can therefore be found at: Scrapy python csv output has blank lines between each row