I am trying to use Scrapy to find the image URLs that are used more than once across all the pages of a website.
Here is my spider:
# -*- coding: utf-8 -*-
from collections import defaultdict

import scrapy
from scrapy import signals
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    handle_httpstatus_list = [403, 404]
    name = 'Example'
    allowed_domains = ['url.com']
    start_urls = ['http://url.com/']
    custom_settings = {
        'LOG_LEVEL': 'INFO'
    }
    count_image_occurrences = defaultdict(int)

    rules = (
        Rule(LinkExtractor(tags=('a',), attrs=('href',), unique=True),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Remember images.
        for image in response.xpath('//img/@src').extract():
            self.count_image_occurrences[image] += 1

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ExampleSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Connect spider_closed so the counts are reported when the crawl ends.
        crawler.signals.connect(spider.spider_closed,
                                signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        spider.logger.info(self.count_image_occurrences)
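One detail: the src values are counted as raw strings, so the same image referenced through different relative paths would end up under several keys. A minimal sketch of a parse_item that normalizes them first with response.urljoin, assuming absolute URLs are the right unit of comparison:

    def parse_item(self, response):
        # Resolve each src against the page URL so one image is not
        # counted under several different relative spellings.
        for src in response.xpath('//img/@src').extract():
            self.count_image_occurrences[response.urljoin(src)] += 1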
Is there a more efficient (speed/memory/code length) way to do this?
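Since only the URLs that occur more than once are of interest, the closing handler could at least filter the counter before logging. A sketch, not benchmarked:

    def spider_closed(self, spider):
        # Keep only the image URLs that were seen more than once.
        duplicates = {url: n
                      for url, n in self.count_image_occurrences.items()
                      if n > 1}
        spider.logger.info(duplicates)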