Scrapy spider: Connection reset by peer

Date: 2014-01-10 15:58:43

Tags: python web-crawler screen-scraping scrapy

I wrote a spider that scrapes content and images from the pages listed in a CSV file. In some cases, though, Python throws a "Connection reset by peer" error, which stops my spider from crawling the remaining pages.

I'm new to Python, and I'd like to know how to ignore or retry the requests that hit this error, so the rest of the crawl is not affected.
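For reference, Scrapy ships a RetryMiddleware that re-issues requests that fail with network errors such as "connection reset by peer", as well as selected HTTP status codes. A minimal settings.py sketch, assuming the default downloader middlewares are enabled; the values here are illustrative, not tuned:

# settings.py -- illustrative values, not tuned for any particular site
RETRY_ENABLED = True          # on by default; shown here for clarity
RETRY_TIMES = 5               # retry each failed request up to 5 times
RETRY_HTTP_CODES = [500, 502, 503, 504, 408]
DOWNLOAD_TIMEOUT = 15         # fail slow connections sooner so retries kick in

Note that this only covers requests that go through Scrapy's downloader; it does not help with the plain urllib call in start_requests below.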

Here is my class:

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector, Selector
from wearus.items import ProductItem
import re
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy import log
from scrapy.http import Request

import urllib
import csv


class dummySpider(BaseSpider):
    name = "dummy"
    domain = "dummy.com"

    parsed_hostnames = set()
    feed_url = "http://url.to/file.csv"
    start_urls = []
    allowed_domains = ['dummy.com' , domain]
    url_col_nr = 12

    handle_httpstatus_list = [404, 503]
    failed_urls = []

    stats = None

    def __init__(self, **kwargs):
        super(dummySpider, self).__init__(**kwargs)
        # run spider_closed() when the spider finishes
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    # @classmethod
    # def from_crawler(cls, crawler):
    #     return cls(crawler.stats)

    def start_requests(self):
        data = urllib.urlopen(self.feed_url)
        reader = csv.reader(data)

        # NOTE: this opens the feed a second time just to count its rows
        log.msg("{0} items in csv file".format(
            len(list(csv.reader(urllib.urlopen(self.feed_url))))), level=log.INFO)

        next(reader)  # skip the header row
        for row in reader:
            url = row[self.url_col_nr].strip()
            row_id = row[0].strip()

            # pull the path between "[[" and the next "/" out of the raw value;
            # guard against rows where the pattern does not match at all
            match = re.search(r'\[\[(.*?)\/', url)
            if match and match.group(1):
                url = "http://" + self.domain + "/" + match.group(1)

            if url.strip() != "":
                item = ProductItem()
                item['prod_id'] = row_id
                # attach the errback so connection errors reach parse_error
                request = Request(url, callback=self.parse,
                                  errback=self.parse_error)
                request.meta['item'] = item
                yield request



    def parse(self, response):
        print response

        if self.stats is None:
            self.stats = self.crawler.stats

        # record 404/503 responses and skip item extraction for them
        if response.status in self.handle_httpstatus_list:
            self.stats.inc_value('failed_url_count')
            self.failed_urls.append(response.url)
            return

        hxs = HtmlXPathSelector(response)
        sel = Selector(response)
        i = response.meta['item']

        name = hxs.select('//h1[@class="product-detail-product-options"]/text()').extract()
        if len(name) > 0:
            i['name'] = name[0]

        cats = sel.css('ul.zzg-breadcrumb>li a::text').extract()

        def cat_replace(item):
            return item.replace(' >', '')

        def cat_filter(item):
            return item != 'Ana Sayfa'

        # strip the " >" separators and drop the home-page breadcrumb
        i['category'] = map(cat_replace, cats)
        i['category'] = filter(cat_filter, i['category'])

        i['url'] = response.url
        color = sel.css('.product-detail-box > table tr:first-child > td:first-child::text').extract()
        if len(color) > 0:
            i['color'] = color[0]

        i['image_urls'] = sel.css('.product-detail-product-image a img::attr(src)').extract()

        return i

    def parse_error(self, failure):
        # errback for failed requests; self.start_urls is an empty list,
        # so the original self.start_urls.next() call could never work.
        # Just log the failure and let the crawl continue.
        log.msg("Request failed: {0}".format(failure), level=log.ERROR)

    def spider_closed(self, spider):
        if self.stats:
            self.stats.set_value('failed_urls', ','.join(spider.failed_urls))

And this is the error message:

ERROR: Obtaining request from start requests
  Traceback (most recent call last):
    File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 1192, in run
      self.mainLoop()
    File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 1201, in mainLoop
      self.runUntilCurrent()
    File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
      call.func(*call.args, **call.kw)
    File "/usr/lib/pymodules/python2.7/scrapy/utils/reactor.py", line 41, in __call__
      return self._func(*self._a, **self._kw)
  --- <exception caught here> ---
    File "/usr/lib/pymodules/python2.7/scrapy/core/engine.py", line 111, in _next_request
      request = next(slot.start_requests)
    File "build/bdist.macosx-10.8-x86_64/egg/dummy/spiders/dummy.py", line 42, in start_requests

    File "/usr/lib/python2.7/socket.py", line 530, in next
      line = self.readline()
    File "/usr/lib/python2.7/socket.py", line 430, in readline
      data = recv(1)
  socket.error: [Errno 104] Connection reset by peer  
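The traceback shows the reset does not happen while crawling pages at all: it happens inside start_requests, where csv.reader pulls rows lazily from the socket opened by urllib.urlopen, so a reset halfway through the feed download kills the whole generator, and Scrapy's retry machinery never sees it. One way around this is to buffer the entire feed in memory, with a simple retry loop, before parsing it. A minimal sketch, assuming the feed fits in memory; fetch_feed is a hypothetical helper, not part of Scrapy:

import csv
import time
import urllib2
from StringIO import StringIO

def fetch_feed(url, retries=3, delay=5):
    # download the whole CSV at once; retry on connection errors
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url).read()
        except IOError:  # socket.error is a subclass of IOError in Python 2
            if attempt == retries - 1:
                raise
            time.sleep(delay)

# in the spider, parse from the in-memory copy instead of the live socket:
#     reader = csv.reader(StringIO(fetch_feed(self.feed_url)))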

0 Answers:

No answers yet.