Error while downloading images with Scrapy

Date: 2014-05-14 13:42:30

Tags: python scrapy web-crawler

I have a Scrapy spider that grabs images and content from an e-commerce site. Now I want to download the images as well; I wrote some code for it, but I get this error:

..

          File "/usr/lib/python2.7/pprint.py", line 238, in format
            return _safe_repr(object, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 282, in _safe_repr
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 323, in _safe_repr
            rep = repr(object)
          File "/usr/local/lib/python2.7/dist-packages/Scrapy-0.23.0-py2.7.egg/scrapy/item.py", line 77, in __repr__
            return pformat(dict(self))
          File "/usr/lib/python2.7/pprint.py", line 63, in pformat
            return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object)
          File "/usr/lib/python2.7/pprint.py", line 122, in pformat
            self._format(object, sio, 0, 0, {}, 0)
          File "/usr/lib/python2.7/pprint.py", line 140, in _format
            rep = self._repr(object, context, level - 1)
          File "/usr/lib/python2.7/pprint.py", line 226, in _repr
            self._depth, level)
          File "/usr/lib/python2.7/pprint.py", line 238, in format
            return _safe_repr(object, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 282, in _safe_repr
            vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 323, in _safe_repr
            rep = repr(object)
          File "/usr/local/lib/python2.7/dist-packages/Scrapy-0.23.0-py2.7.egg/scrapy/item.py", line 77, in __repr__
            return pformat(dict(self))
          File "/usr/lib/python2.7/pprint.py", line 63, in pformat
            return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object)
          File "/usr/lib/python2.7/pprint.py", line 122, in pformat
            self._format(object, sio, 0, 0, {}, 0)
          File "/usr/lib/python2.7/pprint.py", line 140, in _format
            rep = self._repr(object, context, level - 1)
          File "/usr/lib/python2.7/pprint.py", line 226, in _repr
            self._depth, level)
          File "/usr/lib/python2.7/pprint.py", line 238, in format
            return _safe_repr(object, context, maxlevels, level)
          File "/usr/lib/python2.7/pprint.py", line 280, in _safe_repr
            for k, v in _sorted(object.items()):
          File "/usr/lib/python2.7/pprint.py", line 78, in _sorted
            with warnings.catch_warnings():
        exceptions.RuntimeError: maximum recursion depth exceeded

My spider

from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from loom.items import LoomItem
import sys


from scrapy.contrib.loader import XPathItemLoader

from scrapy.utils.response import get_base_url
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class LoomSpider(CrawlSpider):
    name = "loom_org"
    allowed_domains = ["2loom.com"]
    start_urls = [
        "http://2loom.com",
        "http://2loom.com/collections/basic",
        "http://2loom.com/collections/design",
        "http://2loom.com/collections/tum-koleksiyon"
    ]

    rules = [
        Rule(SgmlLinkExtractor(allow='products'), callback='parse_items', follow=True),
        Rule(SgmlLinkExtractor(allow=()), follow=True),
    ]

    def parse_items(self, response):
        sys.setrecursionlimit(10000)        

        item = LoomItem()

        items = []
        sel = Selector(response)
        name = sel.xpath('//h1[@itemprop="name"]/text()').extract()
        brand = "2loom"
        price_lower = sel.xpath('//h1[@class="product-price"]/text()').extract()
        price = "0"
        image = sel.xpath('//meta[@property="og:image"]/@content').extract()
        description = sel.xpath('//meta[@property="og:description"]/@content').extract()

        print image

        ## the image is downloaded here

        loader = XPathItemLoader(item, response=response)
        loader.add_xpath('image_urls', '//meta[@property="og:image"]/@content')


        ## the ID is split out of the name (e.g. "10. Design | Siyah & beyaz kalpli")

        id = name[0].strip().split(". ")
        id = id[0]

        item['id'] = id
        item['name'] = name
        item['url'] = response.url
        item['image'] = loader.load_item()  # load_item() returns the same item instance that was passed to the loader
        item['category'] = "Basic"
        item['description'] = description
        item["brand"] = "2Loom"
        item['price'] = price
        item['price_lower'] = price_lower


        print item


        items.append(item)
        return items


Items

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class LoomItem(Item):
    # define the fields for your item here like:
    # name = Field()

    id = Field()
    name = Field()
    brand = Field()
    image = Field()
    category = Field()
    description = Field()
    price_lower = Field()
    price = Field()
    url = Field()
    images = Field()
    image_urls = Field()    

Pipeline

from scrapy.contrib.pipeline.images import ImagesPipeline, ImageException
from scrapy.http import Request
from cStringIO import StringIO
import psycopg2
import hashlib
from scrapy.conf import settings

class MyImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Schedule a download request for every URL collected in image_urls
        return [Request(x) for x in item.get('image_urls', [])]

    def item_completed(self, results, item, info):
        # Keep only the successfully downloaded images
        item['images'] = [x for ok, x in results if ok]
        return item

    # Override the convert_image method to disable image conversion    
    def convert_image(self, image, size=None):
        buf = StringIO()        
        try:
            image.save(buf, image.format)
        except Exception, ex:
            raise ImageException("Cannot process image. Error: %s" % ex)

        return image, buf    

    def image_key(self, url):
        # Store each image under a SHA1 hash of its URL
        image_guid = hashlib.sha1(url).hexdigest()
        return 'full/%s.jpg' % (image_guid)

Settings

BOT_NAME = 'loom'

SPIDER_MODULES = ['loom.spiders']
NEWSPIDER_MODULE = 'loom.spiders'


DOWNLOAD_DELAY     = 5 

ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = '/root/loom/images/'

IMAGES_THUMBS = {
    'small': (90, 90),
    'big': (300, 300),
}

USER_AGENT     = "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0"
IM_MODULE      = 'loom.pipelines.MyImagePipeline'
ITEM_PIPELINES = ['loom.pipelines.MyImagePipeline']



LOG_LEVEL = 'INFO'  
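
Note that ITEM_PIPELINES is assigned twice above, so the list in the second assignment silently overrides the dict in the first. As a minimal sketch, the two could be merged into the single dict form that the Scrapy 0.23 docs use, assuming MyImagePipeline is meant to replace the stock ImagesPipeline:

# Sketch (assumption): one dict-style ITEM_PIPELINES entry replacing both
# assignments above; the list form is deprecated in this Scrapy version
ITEM_PIPELINES = {'loom.pipelines.MyImagePipeline': 1}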

I don't know why this error occurs, so thanks for any help.

1 answer:

Answer 0 (score: 1)

Try changing the recursion limit with sys.setrecursionlimit(10000) in your spider. My Python interpreter allows about 900 recursions before the "RuntimeError".
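
For reference, a minimal sketch of what this suggests, with the limit raised at module level in the spider file (an assumption; setting it inside parse_items only takes effect after the first response has been parsed). The value 10000 is illustrative:

import sys

# CPython's default recursion limit is typically 1000
print sys.getrecursionlimit()

# Raise the limit before the crawl starts, at module level rather than
# inside a callback
sys.setrecursionlimit(10000)

If the item genuinely ends up containing a reference to itself, a higher limit only postpones this RuntimeError rather than removing it.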