I wrote a spider that crawls the content and images of the pages listed in a CSV file. In some cases, though, Python gives me "Connection reset by peer", which stops my spider from crawling the remaining pages.
I am new to Python, and I would like to know how to ignore, or retry, the request that receives this error, so that the rest of the crawl is not affected.
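From what I have read, Scrapy ships with a RetryMiddleware that retries failed downloads and can be tuned from settings.py. I think something like this would make it retry harder, though I am not sure these are the right values:

RETRY_ENABLED = True   # RetryMiddleware should already be on by default
RETRY_TIMES = 5        # retry each failed download up to 5 times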
Here is my class:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector, Selector
from wearus.items import ProductItem
import re
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy import log
from scrapy.http import Request
import urllib, csv
class dummySpider(BaseSpider):
    name = "dummy"
    domain = "dummy.com"
    parsed_hostnames = set()
    feed_url = "http://url.to/file.csv"
    start_urls = []
    allowed_domains = ['dummy.com', domain]
    url_col_nr = 12
    handle_httpstatus_list = [404, 503]
    failed_urls = []
    stats = None

    def __init__(self, **kwargs):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    # @classmethod
    # def from_crawler(cls, crawler):
    #     return cls(crawler.stats)

    def start_requests(self):
        data = urllib.urlopen(self.feed_url)
        reader = csv.reader(data)
        # note: this downloads the feed a second time just to count the rows
        log.msg("{0} items in csv file".format(
            len(list(csv.reader(urllib.urlopen(self.feed_url))))), level=log.INFO)
        next(reader)  # skip the header row
        for row in reader:
            url = row[self.url_col_nr].strip()
            row_id = row[0].strip()
            # rewrite urls of the form [[path/... to http://domain/path
            match = re.search(r'\[\[(.*?)\/', url)
            if match:
                url = "http://" + self.domain + "/" + match.group(1)
            if url.strip() != "":
                item = ProductItem()
                item['prod_id'] = row_id
                request = self.make_requests_from_url(url)
                request.meta['item'] = item
                yield request

    def parse(self, response):
        print response
        if self.stats is None:
            self.stats = self.crawler.stats
        if response.status in self.handle_httpstatus_list:
            self.stats.inc_value('failed_url_count')
            self.failed_urls.append(response.url)
            return
        hxs = HtmlXPathSelector(response)
        sel = Selector(response)
        i = response.meta['item']
        name = hxs.select('//h1[@class="product-detail-product-options"]/text()').extract()
        if len(name) > 0:
            i['name'] = name[0]
        cats = sel.css('ul.zzg-breadcrumb>li a::text').extract()

        def cat_replace(item):
            return item.replace(' >', '')

        def cat_filter(item):
            return item != 'Ana Sayfa'

        i['category'] = map(cat_replace, cats)
        i['category'] = filter(cat_filter, i['category'])
        i['url'] = response.url
        color = sel.css('.product-detail-box > table tr:first-child > td:first-child::text').extract()
        if len(color) > 0:
            i['color'] = color[0]
        i['image_urls'] = sel.css('.product-detail-product-image a img::attr(src)').extract()
        return i

    def parse_error(self, failure):
        # never called at the moment: no request is created with
        # errback=self.parse_error, and start_urls is an empty list,
        # so start_urls.next() would fail anyway
        yield Request(self.start_urls.next(), callback=self.parse, errback=self.parse_error)

    def spider_closed(self, spider):
        if self.stats:
            self.stats.set_value('failed_urls', ','.join(spider.failed_urls))
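One thing I noticed while pasting this: parse_error is never attached to anything, since make_requests_from_url does not set an errback. My guess (untested) is that the request would need to be built explicitly for the errback to fire:

    request = Request(url, callback=self.parse, errback=self.parse_error)

and then parse_error could just record the failure and let the crawl go on, assuming the failure object carries the request that triggered it:

    def parse_error(self, failure):
        # assumption: failure.request is the request that failed
        self.failed_urls.append(failure.request.url)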
And this is the error message:
ERROR: Obtaining request from start requests
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 1192, in run
    self.mainLoop()
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 1201, in mainLoop
    self.runUntilCurrent()
  File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/lib/pymodules/python2.7/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
--- <exception caught here> ---
  File "/usr/lib/pymodules/python2.7/scrapy/core/engine.py", line 111, in _next_request
    request = next(slot.start_requests)
  File "build/bdist.macosx-10.8-x86_64/egg/dummy/spiders/dummy.py", line 42, in start_requests
  File "/usr/lib/python2.7/socket.py", line 530, in next
    line = self.readline()
  File "/usr/lib/python2.7/socket.py", line 430, in readline
    data = recv(1)
socket.error: [Errno 104] Connection reset by peer
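Looking at the traceback again, the reset seems to happen inside socket.readline() while csv.reader is still iterating over the live connection in start_requests, not while downloading a page, so retrying page requests alone would not help here. Would it be enough to download the whole feed into memory first, with a retry loop around the download? A rough sketch of what I mean (untested; fetch_feed is my own helper, not part of Scrapy):

import csv, time, urllib2
from StringIO import StringIO

def fetch_feed(url, retries=3, delay=5):
    # buffer the whole CSV up front so a mid-read reset cannot abort
    # start_requests; socket.error is a subclass of IOError in Python 2
    for attempt in range(retries):
        try:
            return StringIO(urllib2.urlopen(url).read())
        except IOError:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

# in start_requests:
#     reader = csv.reader(fetch_feed(self.feed_url))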