我只是爬到websit.but重定向anthor页面。在蜘蛛中我添加了
handle_httpstatus_list = [302,301]
并覆盖start_requests
方法。但问题是
AttributeError: 'Response' object has no attribute 'xpath'
蜘蛛代码:
# -*- coding=utf-8 -*-
from __future__ import absolute_import
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule,Spider
from car.items import Car58Item
import scrapy
import time
class Car51Spider (CrawlSpider):
name = 'car51'
allowed_domains = ['51auto.com']
start_urls = ['http://www.51auto.com/quanguo/pabmdcigf?searchtype=searcarlist&curentPage=1&isNewsCar=0&isSaleCar=0&isQa=0&orderValue=record_time']
rules = [Rule(LinkExtractor(allow=('/pabmdcigf?searchtype=searcarlist&curentPage=\d+\&isNewsCar\=0\&isSaleCar\=0\&isQa\=0\&orderValue\=record_time')),callback='parse_item',follow=True)] #//页面读取策略
handle_httpstatus_list = [302,301]
items = {}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url, dont_filter=True, callback=self.parse_item)
def parse_item(self,response):
trs = response.xpath("//div[@class='view-grid-overflow']/a").extract()
for tr in trs:
sales_1 = u''
item = Car58Item()
urls = tr.xpath("a/@href").extract_first()
item['url'] = tr.xpath("a/@href").extract_first()
item['tip'] = tr.xpath("a/ul/li[@class='title']/text()").extract_first()
item['name'] = tr.xpath("a/ul/li[@class='title']/text()").extract_first()
sales_times = tr.xpath("a/ul/li[@class='info']/span/text()").extract()
for x in sales_times:
sales_1 = sales_1 + x
item['sales_time'] = sales_1
item['region'] = tr.xpath("a/ul/li[@class='info']/span[@class='font-color-red']/text()").extract_first()
item['amt'] = tr.xpath("a/ul/li[@class='price']/div[1]/text()").extract_first()
yield scrapy.Request(url=urls,callback=self.parse_netsted_item,meta={'item':item})
def parse_netsted_item(self,response):
dh = u''
dha = u''
mode = response.xpath("//body")
item = Car58Item(response.meta['item'])
dhs = mode.xpath("//div[@id='contact-tel1']/p/text()").extract()
for x in dhs:
dh = dh + x
item['lianxiren_dh'] = dh
lianxiren = mode.xpath("//div[@class='section-contact']/text()").extract()
item['lianxiren'] = lianxiren[1]
item['lianxiren_dz'] = lianxiren[2]
item['details'] = mode.xpath("//div[@id='car-dangan']").extract()
desc = mode.xpath("//div[@class='car-detail-container']/p/text()").extract()
for d in desc:
dha = dha + d
item['description'] = dha
item['image_urls'] = mode.xpath("//div[@class='car-pic']/img/@src").extract()
item['collection_dt'] = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))
return item
settting.py
# -*- coding: utf-8 -*-
# Scrapy settings for car project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'car'
SPIDER_MODULES = ['car.spiders.car51']
#NEWSPIDER_MODULE = 'car.spiders.zhaoming'
DEFAULT_ITEM_CLASS = 'car.items.Car58Item'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1,
'car.pipelines.MongoDBPipeline': 300,
'car.pipelines.Car58ImagesPipeline': 301
}
MONGODB_SERVER ="localhost"
MONGODB_PORT=27017
MONGODB_DB="car"
MONGODB_COLLECTION_CAR="car"
MONGODB_COLLECTION_ZHAOMING="zhaoming"
IMAGES_STORE = "img/"
DOWNLOAD_DELAY = 0.25 # 250 ms of delay
IMAGES_EXPIRES = 90
DOWNLOAD_TIMEOUT=10
LOG_ENABLED=True
LOG_ENCODING='utf-8'
LOG_LEVEL="DEBUG"
LOGSTATS_INTERVAL=5
# LOG_FILE='/tmp/scrapy.log'
CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16
scrapy log:
$scrapy crawl car51
2016-06-14 14:18:38 [scrapy] INFO: Scrapy 1.1.0 started (bot: car)
2016-06-14 14:18:38 [scrapy] INFO: Overridden settings: {'CONCURRENT_REQUESTS_PER_DOMAIN': 16, 'SPIDER_MODULES': ['car.spiders.car51'], 'BOT_NAME': 'car', 'DOWNLOAD_TIMEOUT': 10, 'LOGSTATS_INTERVAL': 5, 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:35.0) Gecko/20100101 Firefox/35.0', 'DEFAULT_ITEM_CLASS': 'car.items.Car58Item', 'DOWNLOAD_DELAY': 0.25}
2016-06-14 14:18:38 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2016-06-14 14:18:38 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-06-14 14:18:38 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-06-14 14:18:38 [py.warnings] WARNING: /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/utils/deprecate.py:156: ScrapyDeprecationWarning: `scrapy.contrib.pipeline.images.ImagesPipeline` class is deprecated, use `scrapy.pipelines.images.ImagesPipeline` instead
ScrapyDeprecationWarning)
2016-06-14 14:18:38 [py.warnings] WARNING: /Users/mayuping/PycharmProjects/car/car/pipelines.py:13: ScrapyDeprecationWarning: Module `scrapy.log` has been deprecated, Scrapy now relies on the builtin Python library for logging. Read the updated logging entry in the documentation to learn more.
from scrapy import log
2016-06-14 14:18:38 [scrapy] INFO: Enabled item pipelines:
['scrapy.pipelines.images.ImagesPipeline',
'car.pipelines.MongoDBPipeline',
'car.pipelines.Car58ImagesPipeline']
2016-06-14 14:18:38 [scrapy] INFO: Spider opened
2016-06-14 14:18:38 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-06-14 14:18:38 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-06-14 14:18:38 [scrapy] DEBUG: Crawled (302) <GET http://www.51auto.com/quanguo/pabmdcigf?searchtype=searcarlist&curentPage=1&isNewsCar=0&isSaleCar=0&isQa=0&orderValue=record_time> (referer: None)
**2016-06-14 14:18:39 [scrapy] ERROR: Spider error processing <GET http://www.51auto.com/quanguo/pabmdcigf?searchtype=searcarlist&curentPage=1&isNewsCar=0&isSaleCar=0&isQa=0&orderValue=record_time> (referer: None)**
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Users/mayuping/PycharmProjects/car/car/spiders/car51.py", line 22, in parse_item
trs = response.xpath("//div[@class='view-grid-overflow']/a").extract()
AttributeError: 'Response' object has no attribute 'xpath'
2016-06-14 14:18:39 [scrapy] INFO: Closing spider (finished)
2016-06-14 14:18:39 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 351,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 420,
'downloader/response_count': 1,
'downloader/response_status_count/302': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 6, 14, 6, 18, 39, 56461),
'log_count/DEBUG': 2,
'log_count/ERROR': 1,
'log_count/INFO': 7,
'log_count/WARNING': 2,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/AttributeError': 1,
'start_time': datetime.datetime(2016, 6, 14, 6, 18, 38, 437336)}
2016-06-14 14:18:39 [scrapy] INFO: Spider closed (finished)
答案 0 :(得分:1)
当你添加handle_httpstatus_list = [302,301]
时,你告诉Scrapy甚至为HTTP重定向调用你的回调,而不是让框架为你透明地处理重定向(这是默认的)。
重定向的某些HTTP响应没有主体或内容标头,因此在这些情况下,在您的回调中,Scrapy按原样递交响应,即普通Response
对象,而不是{{1您拥有HtmlResponse
和.xpath()
个快捷方式。
您真的需要处理HTTP 301和302响应,并且您需要编写回调以便它测试状态代码(.css()
),仅在非3xx情况下提取数据,
或者,您让Scrapy为您处理HTTP重定向,您需要删除蜘蛛中的response.status
。