Settings.py
## b3 p0lit3
USER_AGENT = ' *companyname* TUTORIAL BOT - (*myemail*) | No content Generated will be used - For Educational Purpose'
DOWNLOAD_DELAY = 5.0
AUTOTHROTTLE_ENABLED = True
HTTPCACHE_ENABLED = True
BOT_NAME = 'flaticontest'
SPIDER_MODULES = ['flaticontest.spiders']
NEWSPIDER_MODULE = 'flaticontest.spiders'
IMAGES_STORE = '/home/scriptso/Desktop/flattetstn1'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
items.py
import scrapy
class FlaticontestItem(scrapy.Item):
images = scrapy.Field()
image_urls = scrapy.Field()
title = scrapy.Field()
pachName = scrapy.Field()
image_name = scrapy.Field()
pipelines.py
from scrapy.contrib.pipeline.images import ImagesPipeline
class FlaticontestPipeline(object):
def process_item(self, item, spider):
return item
class CustomImageNamePipeline(ImagesPipeline):
def get_media_requests(self, item, info):
return [Request(x, meta={'image_name': item["image_name"]})
for x in item.get('image_urls', [])]
def file_path(self, request, response=None, info=None):
return '%s.jpg' % request.meta['image_name']
我的蜘蛛...... fltSpi.py
import scrapy
from flaticontest.items import FlaticontestItem
class FltspiSpider(scrapy.Spider):
name = "fltSpi"
allowed_domains = ["flaticon.com"]
start_urls = []
for num in range(1,2000):
start_urls.append("http://www.flaticon.com/free-icons/computing_23394/" + str(num))
def parse(self, response):
for icon in response.css('.icon'):
yield {
'title': icon.css('img').re('title=\"(.*?)\"'),
'image_urls': icon.css('img').re('set=\"(.*?) 4x'),
'pach-name': icon.css('li').re('data-pack="(.*)\" '),
'image_name': icon.css('img').re('title=\"(.*?)\"'),
}
很难理解管道衬里背后的逻辑,但是你能发现我在这里做错了吗?我almsot积极的问题是在管道中的某个地方(显然)......任何人都想指出我正确的方向?!
更多问题排查是我目前所处的位置。
settings.py
USER_AGENT = 'BASH.SEC TUTORIAL BOT - (bash.sec@multuslegio.net) | No content Generated will be used -Educational Purpose'
DOWNLOAD_DELAY = 5.0
AUTOTHROTTLE_ENABLED = True
HTTPCACHE_ENABLED = True
BOT_NAME = 'flaticontest'
SPIDER_MODULES = ['flaticontest.spiders']
NEWSPIDER_MODULE = 'flaticontest.spiders'
IMAGES_STORE = '/home/scriptso/Desktop/flattetstn1'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
ITEM_PIPELINES = {'flaticontest.pipelines.CustomImageNamePipeline': 1}
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
class FlaticontestPipeline(object):
def process_item(self, item, spider):
return item
class CustomImageNamePipeline(ImagesPipeline):
def process_item(self, item, spider):
def get_media_requests(self, item, info):
return [Request(x, meta={'image_name': item["title"]})
for x in item.get('image_urls', [])]
def get_images(self, response, request, info):
for key, image, buf, in super(CustomImageNamePipeline, self).get_images(response, request, info):
if re.compile('^[0-9,a-f]+.jpg$').match(key):
key = self.change_filename(key, response)
yield key, image, buf
def file_path(self, request, response=None, info=None):
return '%s.jpg'% request.meta['image_name']
items.py
import scrapy
class FlaticontestItem(scrapy.Item):
images = scrapy.Field()
image_urls = scrapy.Field()
title = scrapy.Field()
pachName = scrapy.Field()
image_name = scrapy.Field()
{{spyder的}}。PY
import scrapy
from flaticontest.items import FlaticontestItem
from flaticontest.pipelines import *
class FltspiSpider(scrapy.Spider):
name = "fltSpi"
allowed_domains = ["flaticon.com"]
start_urls = []
for num in range(1,2000):
start_urls.append("http://www.flaticon.com/free-icons/computing_23394/" + str(num))
def parse(self, response):
for icon in response.css('.icon'):
yield {
'title': icon.css('img').re('title=\"(.*?)\"'),
'image_urls': icon.css('img').re('set=\"(.*?) 4x'),
'pach-name': icon.css('li').re('data-pack="(.*)\" '),
'image_name': icon.css('img').re('title=\"(.*?)\"'),
}
我觉得我越来越近......因为输出没有表现出来......
2017-01-20 12:48:58 [scrapy] DEBUG:从&lt; 200 http://www.flaticon.com/free-icons/computing_23394/54&gt;刮掉了 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/54&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/52&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/52&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/52&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/52&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/52&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/52&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/52&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/52&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/50&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/50&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/50&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/50&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/50&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/50&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/48&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/48&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/48&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/48&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/48&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/48&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/48&gt;刮掉 没有 2017-01-20 12:48:58 [scrapy] DEBUG:从<200 http://www.flaticon.com/free-icons/computing_23394/48&gt;刮掉 无
从它的外观,它递归遍历每个项目(输出只显示页面响应,但很明显它是)..但即使我没有得到我的项目由于某种原因返回我还没有排除故障,我相信我的管道设置应该是重命名下载的图像......