I am setting the spider's rules dynamically from a script argument: depending on that argument, the spider should either follow every link and crawl many pages, or stay on the start page and crawl nothing further.
Here is my code:
from scrapy.exceptions import CloseSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class LinksExtractor3Spider(CrawlSpider):
    name = 'links_extractor3'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def __init__(self, *args, **kwargs):
        self.flow = kwargs.get('flow')
        if self.flow == 'multi':
            print('multi')
            self.rules = (
                Rule(LinkExtractor(), callback='parse_item', follow=True),
            )
        else:
            print('single')
            print(self.start_urls[0])
            self.rules = (
                Rule(LinkExtractor(), callback='parse_item', follow=False),
            )
        super(LinksExtractor3Spider, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        item = {}
        item['title'] = response.xpath('//head/title/text()').extract()
        item['url'] = response.url
        yield item
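For context, this is roughly how the flow argument reaches the spider. I launch it from a small script; the lines below are only a sketch of that launcher (the same argument can also be passed on the command line with scrapy crawl links_extractor3 -a flow=multi):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# keyword arguments passed to crawl() end up in the spider's __init__ kwargs
process.crawl(LinksExtractor3Spider, flow='multi')  # or flow='single'
process.start()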
I tried Rule(LinkExtractor(), callback='parse_item', follow=False), but it seems I don't understand Rule =)) — follow=False does not stop the spider from visiting the extracted links.
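As far as I understand it, follow=False only stops the rules from being re-applied to the pages reached through that rule; the links extracted from the start page are still requested, and parse_item still runs for each of them. A minimal sketch of what I assume a true "single page" setup would look like (untested; parse_start_url is the CrawlSpider hook called for responses from start_urls):

    def __init__(self, *args, **kwargs):
        self.flow = kwargs.get('flow')
        if self.flow == 'multi':
            self.rules = (
                Rule(LinkExtractor(), callback='parse_item', follow=True),
            )
        else:
            # no rules at all -> no links are extracted or followed
            self.rules = ()
        super(LinksExtractor3Spider, self).__init__(*args, **kwargs)

    def parse_start_url(self, response):
        # CrawlSpider calls this for every response coming from start_urls
        return self.parse_item(response)

Is that the right direction, or can this be expressed through the rules themselves?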
I also tried restricting the extractor with LinkExtractor(allow=('start_urls[0]')), and I tried adding another function, parse_one_item, as a different callback for the rule:
        else:
            print('single')
            print(self.start_urls[0])
            self.i = 0  # page counter; needs to exist before the crawl starts
            self.rules = (
                Rule(LinkExtractor(), callback='parse_one_item', follow=False),
            )
        super(LinksExtractor3Spider, self).__init__(*args, **kwargs)

    def parse_one_item(self, response):
        print('I COUNTER {}'.format(self.i))
        if self.i != 0:
            raise CloseSpider('finished')
        else:
            item = {}
            item['title'] = response.xpath('//head/title/text()').extract()
            item['url'] = response.url
            self.i += 1
            yield item
But none of these approaches works the way I want.
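For what it's worth, I think what I was reaching for with the allow= attempt is something like the sketch below (my assumption: allow takes regexes matched against the extracted link URLs, so the literal string 'start_urls[0]' matches nothing useful):

        import re

        # sketch only: limit extraction to links whose URL matches the start URL
        self.rules = (
            Rule(
                LinkExtractor(allow=(re.escape(self.start_urls[0]),)),
                callback='parse_item',
                follow=False,
            ),
        )

Even then, this only seems to filter which links get extracted from the start page; it does not change how the start page itself is handled.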