我有一只蜘蛛可以获取数据和图像。我想用我正在提取的相应“标题”重命名图像。
以下是我的代码:
spider1.py
from imageToFileSystemCheck.items import ImagetofilesystemcheckItem
import scrapy
class TestSpider(scrapy.Spider):
name = 'imagecheck'
def start_requests(self):
searchterms=['keyword1','keyword2',]
for item in searchterms:
yield scrapy.Request('http://www.example.com/s?=%s' % item,callback=self.parse, meta={'item': item})
def parse(self,response):
start_urls=[]
item = response.meta.get('item')
for i in range(0,2):
link=str(response.css("div.tt a.chek::attr(href)")[i].extract())
start_urls.append(link)
for url in start_urls:
print(url)
yield scrapy.Request(url=url, callback=self.parse_info ,meta={'item': item})
def parse_info(self, response):
url=response.url
title=str(response.xpath('//*[@id="Title"]/text()').extract_first())
img_url_1=response.xpath("//img[@id='images']/@src").extract_first()
scraped_info = {
'url' : url,
'title' : title,
'image_urls': [img_url_1]
}
yield scraped_info
items.py
import scrapy
class ImagetofilesystemcheckItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
pass
pipelines.py
class ImagetofilesystemcheckPipeline(object):
def process_item(self, item, spider):
return item
settings.py
BOT_NAME = 'imageToFileSystemCheck'
SPIDER_MODULES = ['imageToFileSystemCheck.spiders']
NEWSPIDER_MODULE = 'imageToFileSystemCheck.spiders'
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/home/imageToFileSystemCheck/images/'
ROBOTSTXT_OBEY = True
你能否帮我完成所需的修改,以便scrapy能够以'title'.jpg 格式保存被抓取的图像,其中标题被蜘蛛刮掉?
答案 0 :(得分:0)
像这样创建一个蜘蛛
class ShopeeSpider(scrapy.Spider):
_TEMP_IMAGES_STORE = "/home/crawler/scrapers/images"
custom_settings = {
'ITEM_PIPELINES': {
'coszi.pipelines.CustomImagePipeline': 400,
}
"IMAGES_STORE": _TEMP_IMAGES_STORE
}
def parse(self, response):
data = {}
data['images'] = {"image_link_here": "image_name_here"}
然后你的pipelines.py应该是这样的
class CustomImagePipeline(ImagesPipeline):
def get_media_requests(self, item, info):
if 'images' in item:
for image_url, img_name in item['images'].iteritems():
if os.path.exists(os.path.join(item['images_path'], img_name)) == False:
request = scrapy.Request(url=image_url)
request.meta['img_name'] = img_name
request.meta['this_prod_img_folder'] = item['img_name_here']
request.dont_filter = True
yield request
def file_path(self, request, response=None, info=None):
return os.path.join(info.spider.CRAWLER_IMAGES_STORE, request.meta['this_prod_img_folder'], request.meta['img_name'])