def parse(self, response):
    my_item = {'test': 123, 'test2': 321}
    google_url = 'https://www.google.com/search?q=coffee+cans'
    yield Request(url=google_url, callback=self.google, meta={'my_item': my_item})

def google(self, response):
    my_item = response.meta['my_item']
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="r"]/a', allow='/dp', allow_domains='chewy.com'),
             callback="chewy"),
        Rule(LinkExtractor(restrict_xpaths='//div[@class="r"]/a', allow='/p/', allow_domains='homedepot.com'),
             process_request=request.meta['my_item']=my_item, callback='homedepot')  # invalid: assignment inside a call
    )

def homedepot(self, response):
    # my_item = response.meta['my_item']
Error message:
Traceback (most recent call last):
File "/home/timmy/.local/bin/scrapy", line 11, in <module>
sys.exit(execute())
File "/home/timmy/.local/lib/python3.6/site-packages/scrapy/cmdline.py", line 149, in execute
cmd.crawler_process = CrawlerProcess(settings)
File "/home/timmy/.local/lib/python3.6/site-packages/scrapy/crawler.py", line 251, in __init__
super(CrawlerProcess, self).__init__(settings)
File "/home/timmy/.local/lib/python3.6/site-packages/scrapy/crawler.py", line 137, in __init__
self.spider_loader = _get_spider_loader(settings)
File "/home/timmy/.local/lib/python3.6/site-packages/scrapy/crawler.py", line 338, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "/home/timmy/.local/lib/python3.6/site-packages/scrapy/spiderloader.py", line 61, in from_settings
return cls(settings)
File "/home/timmy/.local/lib/python3.6/site-packages/scrapy/spiderloader.py", line 25, in __init__
self._load_all_spiders()
File "/home/timmy/.local/lib/python3.6/site-packages/scrapy/spiderloader.py", line 47, in _load_all_spiders
for module in walk_modules(name):
File "/home/timmy/.local/lib/python3.6/site-packages/scrapy/utils/misc.py", line 71, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python3.6/importlib/__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "<frozen importlib._bootstrap>", line 994, in _gcd_import
File "<frozen importlib._bootstrap>", line 971, in _find_and_load
File "<frozen importlib._bootstrap>", line 955, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 665, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 674, in exec_module
File "<frozen importlib._bootstrap_external>", line 781, in get_code
File "<frozen importlib._bootstrap_external>", line 741, in source_to_code
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/home/timmy/scrapy_tut/myproject/spiders/amazon.py", line 62
process_request=request.meta['my_item']=my_item,callback='homedepot')
^
SyntaxError: invalid syntax
I have edited the question to make it more testable: how do I pass my_item to the links extracted by Rule(LinkExtractor...)? (I moved the rules out of the spider's __init__ to make this easier to do using meta, but I still can't manage it.) Any help is much appreciated.

I tried using:
rules = (
    Rule(LinkExtractor(restrict_xpaths='//div[@class="r"]/a', allow='/dp', allow_domains='chewy.com'),
         process_request=lambda request: request.meta.update({'my_item': my_item}), callback='chewy'),
    Rule(LinkExtractor(restrict_xpaths='//div[@class="r"]/a', allow='/p/', allow_domains='homedepot.com'),
         process_request=lambda request: request.meta.update({'my_item': my_item}), callback='homedepot')
)
This raises no errors, but no pages are requested.
Answer 0 (score: 0)
The first example is invalid Python code, as Python itself reports.

Your second example does not work because the lambda you pass as the process_request parameter of Rule returns None: dict.update returns None rather than the request.
If you check the documentation:

process_request is a callable, or a string (in which case a method from the spider object with that name will be used), which will be called for every request extracted by this rule, and must return a request or None (to filter out the request).
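For illustration (this sketch is mine, not part of the original answer): a process_request callable that honours that contract has to return the request after touching its meta; a lambda built around dict.update returns None and therefore filters every request out.

def add_my_item(request):
    # Before Scrapy 1.7 this callable receives only the request, so the
    # value is hard-coded here purely for illustration.
    request.meta['my_item'] = {'test': 123, 'test2': 321}
    return request  # returning the request keeps it; returning None drops it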
This is actually not the only reason it does not work. To use rule-based link extraction, you must:

- Subclass CrawlSpider. It is not clear from your example whether you do.
- Not reimplement the parse method in your subclass, as you do now. If start_urls is not good enough for you, use parse_start_url instead.
- Declare the rules as a class attribute. Instead, you define them as a variable inside a method of your spider subclass. That will not work.

Please re-read the documentation about the CrawlSpider; a rough skeleton of the expected structure is sketched below.
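As a sketch only (the spider name, start URL and callback body are placeholders, not taken from the original code):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class MySpider(CrawlSpider):  # hypothetical name
    name = 'myspider'
    start_urls = ['https://www.google.com/search?q=coffee+cans']

    # rules must be a class attribute, evaluated once when the class is defined
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="r"]/a',
                           allow='/dp', allow_domains='chewy.com'),
             callback='chewy'),
    )

    def chewy(self, response):
        pass  # parse the extracted page here; do not override parse()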
As for passing a value from the metadata of a response to the metadata of the next request, you have two choices:
Reimplement your spider as a Spider subclass instead of a CrawlSpider subclass, performing all the logic manually without rule-based link extraction. This is the natural step whenever a generic spider like CrawlSpider starts to feel too restrictive: the generic spider subclasses cover simple use cases, but as soon as you hit something non-trivial you should consider switching to a regular Spider subclass. A sketch of this approach follows.
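A minimal sketch of that first option, reusing the search URL, XPath and item values from the question (the spider name and the URL-filtering conditions are illustrative assumptions, not from the original answer):

import scrapy

class CoffeeCansSpider(scrapy.Spider):  # hypothetical name
    name = 'coffee_cans'
    start_urls = ['https://www.google.com/search?q=coffee+cans']

    def parse(self, response):
        my_item = {'test': 123, 'test2': 321}
        # Extract the result links by hand and forward my_item through meta.
        for href in response.xpath('//div[@class="r"]/a/@href').extract():
            if 'homedepot.com' in href and '/p/' in href:
                yield scrapy.Request(href, callback=self.homedepot,
                                     meta={'my_item': my_item})

    def homedepot(self, response):
        my_item = response.meta['my_item']
        # ... parse the product page using my_item ...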
Wait for Scrapy 1.7 to be released, which should happen soon (in the meantime, you can use Scrapy's master branch). Scrapy 1.7 introduces a new response parameter for process_request callbacks, which will allow you to do something like this:
def my_request_processor(request, response):
    request.meta['item'] = response.meta['item']
    return request

class MySpider(CrawlSpider):
    # …
    rules = (
        Rule(
            LinkExtractor(
                restrict_xpaths='//div[@class="r"]/a',
                allow='/p/',
                allow_domains='homedepot.com'
            ),
            process_request=my_request_processor,
            callback='homedepot'
        ),
    )
    # …