这是我通过图像管道下载图像的程序。它运行良好并且能够下载图像,但问题是**它会用 sha1 哈希值重命名图像**,之后我就无法识别它们了。有没有什么解决方案,可以让我在下载图像时使用 **model_name** 作为文件名?
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time
class CompItem(scrapy.Item):
    """Item describing one product page and the images scraped from it."""

    model_name = scrapy.Field()
    # Filled in by the images pipeline with the download results.
    images = scrapy.Field()
    # Image URLs collected by the spider for the pipeline to download.
    image_urls = scrapy.Field()
    # Human-readable name the downloaded files should be stored under.
    image_name = scrapy.Field()
    # Written by the pipeline's item_completed(); assigning to an undeclared
    # field raises KeyError, so it has to be declared here.
    image_paths = scrapy.Field()
class criticspider(CrawlSpider):
    """Render product pages in Firefox and collect their gallery image URLs.

    The pages are built client-side, so each start URL is re-opened in a
    Selenium-driven browser and the rendered DOM is parsed instead of the
    raw Scrapy response body.
    """

    name = "buysmaart_images"
    # allowed_domains takes bare host names; a full URL never matches.
    allowed_domains = ["buysmaart.com"]
    start_urls = [
        "http://buysmaart.com/productdetails/550/Samsung-Galaxy-Note-4",
        "http://buysmaart.com/productdetails/115/HTC-One-M8-Eye",
        "http://buysmaart.com/productdetails/506/OPPO-N1",
        "http://buysmaart.com/productdetails/342/LG-G2-D802T",
    ]

    def __init__(self, *args, **kwargs):
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(2)

    def closed(self, reason):
        """Quit the browser when the spider closes so Firefox is not leaked."""
        self.browser.quit()

    def parse_start_url(self, response):
        """Yield one CompItem holding the product name and its image URLs.

        The item always carries ``image_urls`` (possibly empty);
        ``image_name`` is set only when the heading is found on the page.
        """
        self.browser.get(response.url)
        # Fixed pause so the page's scripts can finish rendering the gallery.
        time.sleep(8)
        sel = Selector(text=self.browser.page_source)
        item = CompItem()

        # The heading is looked up on the whole page, so query it once
        # instead of once per photo.
        names = sel.xpath('.//h3[contains(@class,"ng-binding")]/text()').extract()
        if names:
            item['image_name'] = names[0].encode('ascii', 'ignore')

        photos = sel.xpath('//ul[contains(@id,"productImageUl")]/li')
        print(len(photos))

        all_photo_urls = []
        for photo in photos:
            sources = photo.xpath('.//img/@src').extract()
            if not sources:
                # A gallery entry without an <img src> has nothing to download.
                continue
            # Resolve relative paths; a request URL must be absolute.
            all_photo_urls.append(urljoin(response.url, sources[0]))

        item['image_urls'] = all_photo_urls
        yield item
管道
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request


class DownloadImagesPipeline(ImagesPipeline):
    """Images pipeline that names downloads after the item's ``image_name``.

    The name and the position of each URL travel in the request ``meta``.
    ``file_path`` turns them into ``full/<name>_<index>.jpg``, so several
    images of one item do not overwrite each other. Items without a name
    fall back to the parent's default path.
    """

    def get_media_requests(self, item, info):
        """Return one download request per entry in ``item['image_urls']``."""
        name = item.get('image_name')
        # The spider may store either a plain string or a one-element list.
        if isinstance(name, (list, tuple)):
            name = name[0] if name else None
        return [
            Request(url, meta={'image_name': name, 'image_index': index})
            for index, url in enumerate(item.get('image_urls', []))
        ]

    def file_path(self, request, response=None, info=None):
        """Return the storage path for ``request``, relative to IMAGES_STORE."""
        name = request.meta.get('image_name')
        if not name:
            return super(DownloadImagesPipeline, self).file_path(
                request, response=response, info=info)
        return 'full/%s_%d.jpg' % (name, request.meta.get('image_index', 0))

    def item_completed(self, results, item, info):
        """Store the paths of successful downloads on the item.

        Raises:
            DropItem: if none of the item's images could be downloaded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # The item class must declare an ``image_paths`` field.
        item['image_paths'] = image_paths
        return item
设置
BOT_NAME = 'download_images'
SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'
# Register the project's own pipeline: the stock ImagesPipeline keeps the
# sha1-hash file names, so the custom naming code would never run.
# NOTE(review): assumes the class lives in download_images/pipelines.py - confirm.
ITEM_PIPELINES = {'download_images.pipelines.DownloadImagesPipeline': 1}
IMAGES_STORE = '/home/john/Desktop/download_images/31_jul'
答案 0 :(得分:3)
Scrapy 1.3.3 解决方案(覆盖 image_downloaded 方法):
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that stores downloads under a name taken from the item."""

    def get_media_requests(self, item, info):
        """Yield one request per image URL, passing the item's names in ``meta``."""
        for url in item['image_urls']:
            request_meta = {'image_names': item["image_names"]}
            yield scrapy.Request(url, meta=request_meta)

    def image_downloaded(self, response, request, info):
        """Persist every image produced for ``response`` and return a checksum.

        The checksum is the md5 of the first buffer yielded by
        ``get_images``; it stays ``None`` when nothing is yielded.
        """
        checksum = None
        for _default_path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            # Ignore the path from get_images: every yielded image is stored
            # under the first name carried in the request meta.
            target_path = 'full/%s' % response.meta['image_names'][0]
            size_meta = {'width': width, 'height': height}
            self.store.persist_file(
                target_path, buf, info,
                meta=size_meta,
                headers={'Content-Type': 'image/jpeg'})
        return checksum
答案 1 :(得分:1)
解决方案是覆盖 DownloadImagesPipeline 类的 image_key 方法。
def image_key(self, url):
    # Placeholder: returns the same literal key regardless of ``url``.
    return 'image_name.here'
例如,如果您想使用 URL 中的图像名称,可以用
url.split('/')[-1]
作为图像的名称。注意,此方法已弃用,可能会在将来的版本中被删除。
或者,您可以在 Spider 中设置 image_name:
item['image_name'] = ['whatever_you_want']
在这种情况下,您需要更多地扩展管道以利用您提供的图像的名称:
def get_media_requests(self, item, info):
    """Return one download request per image URL, carrying the item's name.

    The name travels under ``meta['image_name']``, the same key that
    ``change_filename`` reads back from ``response.meta``.
    """
    return [Request(x, meta={'image_name': item["image_name"]})
            for x in item.get('image_urls', [])]
def get_images(self, response, request, info):
    """Yield the parent's ``(key, image, buf)`` tuples, renaming hash-named images.

    Only keys that look like a hex-hash ``.jpg`` are passed to
    ``change_filename``; any other key is yielded unchanged.
    """
    import re  # local import: this snippet has no import block of its own

    # Compiled once per call. The optional "full/" prefix covers the default
    # key layout (NOTE(review): confirm against the Scrapy version in use).
    hashed_key = re.compile(r'^(?:full/)?[0-9a-f]+\.jpg$')
    for key, image, buf in super(DownloadImagesPipeline, self).get_images(response, request, info):
        if hashed_key.match(key):
            key = self.change_filename(key, response)
        yield key, image, buf
def change_filename(self, key, response):
    """Return ``<image_name>.jpg`` built from the name in ``response.meta``.

    ``key`` is accepted for interface compatibility and is not used.
    The name may be a plain string or a list/tuple whose first element is
    the name; indexing a plain string would keep only its first character.
    """
    name = response.meta['image_name']
    if isinstance(name, (list, tuple)):
        name = name[0]
    return "%s.jpg" % name
当然,您的管道应该继承(扩展)ImagesPipeline。