我用scrapy从一个中文网站上获取图片:https://www.meishij.net/china-food/caixi/
可以从URL的SHA1哈希中下载带有名称的图片。但是我想给图片添加新的名称,即item ['image_names'],然后根据item ['cuisine_names']将其放入不同的文件夹中。当我尝试更改管道代码时,没有下载图片,并且出现警告:该项目中没有图片。
我应该怎么做才能解决问题?
蜘蛛文件meishijie.py如下:
import scrapy
import re
from ..items import ImgItem
class MeishijieSpider(scrapy.Spider):
name = "meishijie"
allowed_domains = ["meishij.net"]
start_urls = ['https://www.meishij.net/china-food/caixi/']
def parse(self, response):
cuisine_list=response.xpath('//dl[@class="listnav_dl_style1 w990 clearfix"]//dd/a/@href').extract()
# extract the link of each cuisine
#print(len(link_list)) # the amount of the cuicines
for cuisine_url in cuisine_list:
#print(cuisine_url)
yield scrapy.Request(cuisine_url,callback=self.parse_cuisine_img)
def parse_cuisine_img(self,response):
item=ImgItem()
item['image_urls'] = response.xpath('//img[@class="img"]//@src').extract()
item['image_names'] = response.xpath('//div[@class="c1"]//strong//text()').extract()
#item['vc'] = response.xpath('//div[@class="c1"]//span//text()').extract()
#print(len(item['image_urls']))
# get the url of the next page
next_link=response.xpath('//div[@class="listtyle1_page"]//a[@class="next"]//@href').extract()
split_name=re.split('/',next_link[0])
cuisine=split_name[-2] # get the name of each cuisine
item['cuisine_names']=cuisine
#print(item['cuisine_names'])
#print(item['image_names'])
#print(item['image_urls'])
#print(item['cuisine_names'])
yield item
if next_link:
next_link = next_link[0]
#print(next_link)
yield scrapy.Request(next_link,callback=self.parse_cuisine_img)
pipelines.py
import re
import os
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
import json
import codecs
import shutil
class JsonWithEncodingPipeline(object):
def __init__(self):
self.file=codecs.open('meishijie.json','w',encoding='utf-8')
def process_item(self,item,spider):
line = json.dumps(dict(item),ensure_ascii=False) + '\n'
self.file.write(line)
return item
def spider_closed(self,spider):
self.file.close()
class ImgPipeline(ImagesPipeline):
def get_media_request(self,item,info):
for img_url in item['image_urls']:
yield Request(img_url,meta={'item':item,'index':item['image_urls'].index(img_url)})
def item_completed(self,results,item,info):
image_paths = [x['path'] for ok,x in results if ok]
if not image_paths:
raise DropItem('Item contains no images')
#item['image_paths'] = image_paths
return item
def file_path(self, request, response=None, info=None):
item = request.meta['item']
index = request.meta['index']
folder_name=item['cuisine_names']
image_guid = item['image_names'][index]
filename = u'full/{0}/{1}'.format(folder_name,image_guid)
return filename
items.py
import scrapy
class ImgItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
cuisine_names = scrapy.Field()
image_names = scrapy.Field()
image_urls = scrapy.Field()
#vc = scrapy.Field()
#images=scrapy.Field()
settings.py
BOT_NAME = 'img'
SPIDER_MODULES = ['img.spiders']
NEWSPIDER_MODULE = 'img.spiders'
ITEM_PIPELINES = {
'img.pipelines.ImgPipeline': 1,
'img.pipelines.JsonWithEncodingPipeline':2,
#'img.pipelines.ImgPipeline':1
}
IMAGES_STORE='.../food-image/pic'