如何通过scrapy下载时为图像提供自定义名称

时间:2015-08-03 05:29:52

标签: python web-crawler scrapy

这是我通过图像管道下载图像的程序。它运行良好并下载图像,但问题**是否重命名sha1哈希中的图像,之后我无法识别它们。可以有任何解决方案,以便我可以在下载图像时使用** model_name 吗?

   import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time

class CompItem(scrapy.Item):
    model_name = scrapy.Field()
    images = scrapy.Field()
    image_urls = scrapy.Field()
    image_name = scrapy.Field()

class criticspider(CrawlSpider):
    name = "buysmaart_images"
    allowed_domains = ["http://buysmaart.com/"]
    start_urls = ["http://buysmaart.com/productdetails/550/Samsung-Galaxy-Note-4",  "http://buysmaart.com/productdetails/115/HTC-One-M8-Eye",  "http://buysmaart.com/productdetails/506/OPPO-N1",  "http://buysmaart.com/productdetails/342/LG-G2-D802T"]

    def __init__(self, *args, **kwargs):
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(2)

    def parse_start_url(self, response):
        self.browser.get(response.url)
        time.sleep(8)
        sel = Selector(text=self.browser.page_source)
        item = CompItem()

        photos = sel.xpath('//ul[contains(@id,"productImageUl")]/li')
        print len(photos)
        all_photo_urls = []
        for photo in photos:
            item['image_name'] = sel.xpath('.//h3[contains(@class,"ng-binding")]/text()').extract()[0].encode('ascii','ignore')
            #tmp_url = photo.xpath('.//img/@src').extract()[0].encode('ascii','ignore')
            image_url = photo.xpath('.//img/@src').extract()[0]
            all_photo_urls.append(image_url)
            item['image_urls'] = all_photo_urls
        yield item

管道

    from scrapy.contrib.pipeline.images import DownloadImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class DownloadImagesPipeline(object):
    def process_item(self, item, spider):
         def get_media_requests(self, item, info):
        return [Request(x, meta={'image_names': item["image_name"]})
                for x in item.get('image_urls', [])]

def get_images(self, response, request, info):
    for key, image, buf, in super(DownloadImagesPipeline, self).get_images(response, request, info):
        if re.compile('^[0-9,a-f]+.jpg$').match(key):
            key = self.change_filename(key, response)
        yield key, image, buf

def change_filename(self, key, response):
    return "%s.jpg" % response.meta['image_name'][0]

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

设置

BOT_NAME = 'download_images'

SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGES_STORE= '/home/john/Desktop/download_images/31_jul'

2 个答案:

答案 0 :(得分:3)

Scrapy 1.3.3解决方案(覆盖image_downloaded方法):

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum
class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'image_names': item["image_names"]})

    def image_downloaded(self, response, request, info):
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            path = 'full/%s' % response.meta['image_names'][0] # **Here Changed**
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum

答案 1 :(得分:1)

解决方案是覆盖image_key类的DownloadImagesPipeline方法。

def image_key(self, url):
    return 'image_name.here'

例如,如果您想要可以使用的URL的图像名称

url.split('/')[-1]

作为图像的名称。 注意,此方法已弃用,可在将来的版本中删除。

或者,您可以在image_name

中为图片设置Spider
item['image_name'] = ['whatever_you_want']

在这种情况下,您需要更多地扩展管道以利用您提供的图像的名称:

def get_media_requests(self, item, info):
        return [Request(x, meta={'image_names': item["image_name"]})
                for x in item.get('image_urls', [])]

def get_images(self, response, request, info):
    for key, image, buf, in super(DownloadImagesPipeline, self).get_images(response, request, info):
        if re.compile('^[0-9,a-f]+.jpg$').match(key):
            key = self.change_filename(key, response)
        yield key, image, buf

def change_filename(self, key, response):
    return "%s.jpg" % response.meta['image_name'][0]

当然,您的管道应该扩展ImagesPipeline