This is the spider:
# -*- coding: utf-8 -*-
import scrapy

from scrapy_test.items import ScrapyTestItem


class QiushiSpider(scrapy.Spider):
    name = "qiushibaike"
    start_urls = [
        "http://www.qiushibaike.com",
    ]

    def parse(self, response):
        for temp in response.xpath(".//div[@id='content']//div[@id='content-left']/div"):
            # Create a fresh item for each post instead of reusing one instance
            item = ScrapyTestItem()
            # a = str(map(str, ''.join(temp.xpath(".//div[@class='content']/span/text()").extract())))
            item['text'] = temp.xpath(".//div[@class='content']/span/text()").extract()
            item['number'] = temp.xpath(".//div[@class='stats']/span[@class='stats-vote']/i/text()").extract()
            yield item
This is the pipeline that saves the data:
import json

from scrapy.exceptions import DropItem


class ScrapyTestPipeline(object):
    def __init__(self):
        self.ids_seen = set()
        self.file = open("aaa.jl", "w", encoding='utf8')

    def process_item(self, item, spider):
        item['id'] = hash(''.join(item['text']))
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            line = json.dumps(dict(item)) + "\n"
            self.file.write(line)
            return item
I am trying to save the data, but the saved data's encoding comes out wrong. I tried other approaches, but they all failed. What should I do?
This is the bad output:

"text": ["\n\n\nlz\u7537\uff0c\u4eca\u5929\u8ddf\u670b\u53cb\u53bbktv\u5531\u6b4c\uff01"]
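For context (not part of the original question): those \uXXXX sequences are what Python's json.dumps produces by default, because it escapes every non-ASCII character unless ensure_ascii=False is passed. A minimal demonstration:

import json

# Default: non-ASCII characters are escaped to \uXXXX sequences
print(json.dumps({"text": "今天"}))                      # {"text": "\u4eca\u5929"}

# ensure_ascii=False writes the characters through unchanged
print(json.dumps({"text": "今天"}, ensure_ascii=False))  # {"text": "今天"}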
Answer 0 (score: 0)
How are you running the scraper? Are you using Scrapy's default exporters to save the data as JSON/CSV? If so, set this in settings.py:
FEED_EXPORT_ENCODING = 'utf-8'
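The same setting can also be scoped to a single spider through Scrapy's standard custom_settings class attribute, if editing the project-wide settings.py is inconvenient; a minimal sketch (spider body omitted, only the relevant attribute shown):

import scrapy


class QiushiSpider(scrapy.Spider):
    name = "qiushibaike"
    # Applies only to this spider: the built-in JSON/CSV feed exporters
    # will write real UTF-8 text instead of \uXXXX escapes
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
    }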
Answer 1 (score: 0)
# -*- coding: utf-8 -*-
import re

import scrapy

from scrapy_test.items import ScrapyTestItem


class QiushiSpider(scrapy.Spider):
    name = "qiushibaike"
    start_urls = [
        "http://www.qiushibaike.com",
    ]

    def parse(self, response):
        for temp in response.xpath(".//div[@id='content']//div[@id='content-left']/div"):
            item = ScrapyTestItem()
            a = temp.xpath(".//div[@class='content']/span")
            # Use the trailing digits of the element's id attribute as a stable item id
            post_id = temp.xpath('.//@id').extract_first('')
            item['id'] = re.search('[0-9]*$', post_id).group(0)
            # string(.) flattens all text inside the node, including nested tags
            item['text'] = ''.join(a.xpath("string(.)").extract()).strip().encode('utf-8')
            item['number'] = temp.xpath(".//div[@class='stats']/span[@class='stats-vote']/i/text()").extract_first('')
            yield item
This is the pipeline:
# -*- coding: utf-8 -*-
from scrapy.exceptions import DropItem


class ScrapyTestPipeline(object):
    def __init__(self):
        self.ids_seen = set()
        self.file = "liu.jl"

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['id'])
            # Open with an explicit encoding so the file holds readable UTF-8;
            # item['text'] was encoded to bytes in the spider, so decode it here
            with open(self.file, 'a', encoding='utf-8') as f:
                f.write('text: ' + item['text'].decode() + '\n')
                f.write('number ' + item['number'] + '\n')
                f.write('id ' + item['id'] + '\n\n')
            return item
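A follow-up note, not part of the original answer: the encode-in-the-spider / decode-in-the-pipeline round trip can be avoided entirely by keeping item['text'] as a plain string and writing JSON Lines with ensure_ascii=False. A minimal sketch of that variant, assuming the same item fields (the class name JsonLinesPipeline is hypothetical):

# -*- coding: utf-8 -*-
import json

from scrapy.exceptions import DropItem


class JsonLinesPipeline(object):
    def __init__(self):
        self.ids_seen = set()
        self.path = "liu.jl"

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.ids_seen.add(item['id'])
        with open(self.path, 'a', encoding='utf-8') as f:
            # ensure_ascii=False keeps the Chinese text readable in the output
            f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item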