Hello, I'm new to Python and Scrapy. I'm trying to write a spider, but I can't find the cause of (or a fix for) an error that occurs while the links extracted by my rule are processed.
I don't know whether the problem is the encoding, relative paths, or something else.
The start URL yields 94 item links, and when I run the script I get 94 "spider_exceptions/ValueError" errors, as you can see below.
Error log:
ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
2017-07-17 16:12:49 [scrapy.core.scraper] ERROR: Spider error processing <GET https://subastas.boe.es/detalleSubasta.php?idSub=SUB-JA-2017-68197&idBus=_VDFMQktMNXdpU0loK3B1UjZhMzhzUHdTUmdiTW9DNjBhM3lkMWpZWDBGbXdtOEVmWW13VmlhSC8vQUR5V1RNRjY0NWhVcjd2aDRMbkVyMkFLbmN4Ym0wc1E4eHVHWHlxSURJSTVBeGhzNGFIRzNkOUpBbW9SRG5RZExsbUNNeFFORSs1R21vaEJIeVhrMkdKdGRYUzg5N1laT2NPUTBwYUI0SVlHTm8vRkF4UEpleHE0b2U2MmZTdFhvZlIyUzgyemg0ekhOSEVoWEtuaVFMbXdBei92MytWaXNhWGtUTVd4SDJZUk9KUUJpVnExa01TeUhOcGZFQ1JqZDIxVU9BTWpHMGJVRU9rNmljVVN4UFFkNUp4SG1FR3dYWGlrVGgxWVJnWkRIQVJXZWxadVRpYWRUcm81WUgxeW4xb3RxQWJXV3JSNUl1N0NYZFoyVlhDaldGWU5RPT0,>
(referer: https://subastas.boe.es/subastas_ava.php?campo%5B0%5D=SUBASTA.ORIGEN&dato%5B0%5D=&campo%5B1%5D=SUBASTA.ESTADO&dato%5B1%5D=EJ&campo%5B2%5D=BIEN.TIPO&dato%5B2%5D=I&dato%5B3%5D=501&campo%5B4%5D=BIEN.DIRECCION&dato%5B4%5D=&campo%5B5%5D=BIEN.CODPOSTAL&dato%5B5%5D=&campo%5B6%5D=BIEN.LOCALIDAD&dato%5B6%5D=&campo%5B7%5D=BIEN.COD_PROVINCIA&dato%5B7%5D=28&campo%5B8%5D=SUBASTA.POSTURA_MINIMA_MINIMA_LOTES&dato%5B8%5D=&campo%5B9%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_1&dato%5B9%5D=&campo%5B10%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_2&dato%5B10%5D=&campo%5B11%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_3&dato%5B11%5D=&campo%5B12%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_4&dato%5B12%5D=&campo%5B13%5D=SUBASTA.NUM_CUENTA_EXPEDIENTE_5&dato%5B13%5D=&campo%5B14%5D=SUBASTA.ID_SUBASTA_BUSCAR&dato%5B14%5D=&campo%5B15%5D=SUBASTA.FECHA_FIN_YMD&dato%5B15%5D%5B0%5D=&dato%5B15%5D%5B1%5D=&campo%5B16%5D=SUBASTA.FECHA_INICIO_YMD&dato%5B16%5D%5B0%5D=&dato%5B16%5D%5B1%5D=&page_hits=1000&sort_field%5B0%5D=SUBASTA.FECHA_FIN_YMD&sort_order%5B0%5D=desc&sort_field%5B1%5D=SUBASTA.FECHA_FIN_YMD&sort_order%5B1%5D=asc&sort_field%5B2%5D=SUBASTA.HORA_FIN&sort_order%5B2%5D=asc&accion=Buscar)
and the crawl finishes with these stats:
2017-07-17 16:12:49 [scrapy.core.engine] INFO: Closing spider (finished)
2017-07-17 16:12:49 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 176632,
'downloader/request_count': 95,
'downloader/request_method_count/GET': 95,
'downloader/response_bytes': 1279009,
'downloader/response_count': 95,
'downloader/response_status_count/200': 95,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 7, 17, 14, 12, 49, 900000),
'log_count/DEBUG': 96,
'log_count/ERROR': 94,
'log_count/INFO': 7,
'request_depth_max': 1,
'response_received_count': 95,
'scheduler/dequeued': 95,
'scheduler/dequeued/memory': 95,
'scheduler/enqueued': 95,
'scheduler/enqueued/memory': 95,
'spider_exceptions/ValueError': 94,
'start_time': datetime.datetime(2017, 7, 17, 14, 12, 46, 66000)}
2017-07-17 16:12:49 [scrapy.core.engine] INFO: Spider closed (finished)
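From what I can tell, this exact message comes from lxml, which Scrapy's selectors use under the hood: XPath expressions passed as byte strings must be pure ASCII, while unicode expressions are accepted. A minimal sketch of the same failure, assuming Python 2 and an lxml-backed selector (not my actual spider):

# -*- coding: utf-8 -*-
# Minimal reproduction sketch (assumes Python 2 + lxml): an XPath given
# as a byte string must be ASCII-only; non-ASCII bytes raise ValueError.
from lxml import etree

root = etree.fromstring(u"<table><tr><th>Tasación</th></tr></table>",
                        etree.HTMLParser())
print(root.xpath(u'//th[text()="Tasación"]'))  # unicode query: works
print(root.xpath('//th[text()="Tasación"]'))   # UTF-8 byte string on Python 2:
# ValueError: All strings must be XML compatible: Unicode or ASCII,
# no NULL bytes or control characters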
Code:
Spyder.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from boe.items import boeItem


class boeSpider(CrawlSpider):
    name = 'boe'
    item_count = 0
    allowed_domains = ['subastas.boe.es']
    start_urls = ['https://subastas.boe.es/subastas_ava.php?campo[0]=SUBASTA.ORIGEN&dato[0]=&campo[1]=SUBASTA.ESTADO&dato[1]=EJ&campo[2]=BIEN.TIPO&dato[2]=I&dato[3]=501&campo[4]=BIEN.DIRECCION&dato[4]=&campo[5]=BIEN.CODPOSTAL&dato[5]=&campo[6]=BIEN.LOCALIDAD&dato[6]=&campo[7]=BIEN.COD_PROVINCIA&dato[7]=28&campo[8]=SUBASTA.POSTURA_MINIMA_MINIMA_LOTES&dato[8]=&campo[9]=SUBASTA.NUM_CUENTA_EXPEDIENTE_1&dato[9]=&campo[10]=SUBASTA.NUM_CUENTA_EXPEDIENTE_2&dato[10]=&campo[11]=SUBASTA.NUM_CUENTA_EXPEDIENTE_3&dato[11]=&campo[12]=SUBASTA.NUM_CUENTA_EXPEDIENTE_4&dato[12]=&campo[13]=SUBASTA.NUM_CUENTA_EXPEDIENTE_5&dato[13]=&campo[14]=SUBASTA.ID_SUBASTA_BUSCAR&dato[14]=&campo[15]=SUBASTA.FECHA_FIN_YMD&dato[15][0]=&dato[15][1]=&campo[16]=SUBASTA.FECHA_INICIO_YMD&dato[16][0]=&dato[16][1]=&page_hits=1000&sort_field[0]=SUBASTA.FECHA_FIN_YMD&sort_order[0]=desc&sort_field[1]=SUBASTA.FECHA_FIN_YMD&sort_order[1]=asc&sort_field[2]=SUBASTA.HORA_FIN&sort_order[2]=asc&accion=Buscar']

    rules = (
        # One rule: follow every auction-detail link on the results page
        Rule(LinkExtractor(allow=(), restrict_xpaths="//a[contains(@class,'resultado-busqueda-link-defecto')]"),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        DATAQ = boeItem()
        # General auction info
        DATAQ['Gen_Id'] = response.xpath('//th[text()="Identificador"]/following-sibling::td[1]/strong/text()').extract_first()
        DATAQ['Gen_Tipo'] = response.xpath('//th[text()="Tipo de subasta"]/following-sibling::td[1]/strong/text()').extract()
        DATAQ['Gen_Inicio'] = response.xpath('//th[text()="Fecha de inicio"]/following-sibling::td[1]/span/text()').extract()
        DATAQ['Gen_Fin'] = response.xpath('//th[text()="Fecha de conclusión"]/following-sibling::td[1]/span/text()').extract()
        DATAQ['Gen_Deuda'] = response.xpath('//th[text()="Cantidad reclamada"]/following-sibling::td[1]/text()').extract()
        DATAQ['Gen_Lotes'] = response.xpath('//th[text()="Lotes"]/following-sibling::td[1]/text()').extract()
        DATAQ['Gen_Anuncio'] = response.xpath('//th[text()="Anuncio BOE"]/following-sibling::td[1]/a/@href').extract()
        DATAQ['Gen_Valor'] = response.xpath('//th[text()="Valor subasta"]/following-sibling::td[1]/text()').extract()
        DATAQ['Gen_Tasacion'] = response.xpath('//th[text()="Tasación"]/following-sibling::td[1]/text()').extract()
        DATAQ['Gen_Minimo'] = response.xpath('//th[text()="Puja mínima"]/following-sibling::td[1]/text()').extract_first()
        DATAQ['Gen_Tramos'] = response.xpath('//th[text()="Tramos entre pujas"]/following-sibling::td[1]/text()').extract_first()
        DATAQ['Gen_Deposito'] = response.xpath('//th[text()="Importe del depósito"]/following-sibling::td[1]/text()').extract()
        # Stop after 10 items while testing
        self.item_count += 1
        if self.item_count > 10:
            raise CloseSpider('item_exceeded')
        yield DATAQ
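If the cause really is the byte-string XPaths, a possible fix (untested, and assuming the spider runs on Python 2) would be to hand lxml unicode expressions, either by prefixing the accented XPath strings with u'...' or with a single import at the top of Spyder.py:

# Possible fix sketch (Python 2 only, untested): make every string
# literal in this module unicode, so accented XPath queries such as
# "Fecha de conclusión" reach lxml as unicode rather than UTF-8 bytes.
from __future__ import unicode_literals

The plain-ASCII queries ("Identificador", "Lotes", ...) would be unaffected either way.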
settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for boe project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'boe'
SPIDER_MODULES = ['boe.spiders']
NEWSPIDER_MODULE = 'boe.spiders'
# CSV import
ITEM_PIPELINES = {'boe.pipelines.boePipeline': 500, }
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'boe (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
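I'm not including boe/pipelines.py here; for anyone trying to reproduce this, a minimal stand-in consistent with the ITEM_PIPELINES entry above would be (hypothetical, not my real pipeline code):

# boe/pipelines.py -- hypothetical minimal pipeline matching the
# ITEM_PIPELINES setting above; the actual implementation is not shown.
class boePipeline(object):
    def process_item(self, item, spider):
        # Pass items through unchanged (the real pipeline handles CSV)
        return item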
Thanks in advance.