I want to use a subclass to run my Spider with different URLs and save information in a loop. This is my approach:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class Run_Spider_From_SubClass:

    def __init__(self, url_list, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_list = url_list

    def run_spider_in_loop(self):
        # Here I start feeding urls; more importantly, I can save info for
        # each crawling process and then restart the process with another
        # url as the loop continues.
        for url in self.url_list:
            # CrawlerProcess(get_project_settings()) makes scrapy use the
            # configuration in 'settings.py'; every class must start from this
            process = CrawlerProcess(get_project_settings())
            process.crawl('MySpider', url)
            process.start()
            save_info  # any code that saves info from the spider into the target file
After this, I run into a reactor problem:
Traceback (most recent call last):
  File "G:/python/test/scrapy_test/test4.py", line 45, in <module>
    Run_Spider_From_SubClass(url_list).run_spider_in_loop()
  File "G:/python/test/scrapy_test/test4.py", line 34, in run_spider_in_loop
    process.start()
  File "C:\Users\super\AppData\Roaming\Python\Python36\site-packages\scrapy\crawler.py", line 291, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "C:\Users\super\AppData\Roaming\Python\Python36\site-packages\twisted\internet\base.py", line 1266, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "C:\Users\super\AppData\Roaming\Python\Python36\site-packages\twisted\internet\base.py", line 1246, in startRunning
    ReactorBase.startRunning(self)
  File "C:\Users\super\AppData\Roaming\Python\Python36\site-packages\twisted\internet\base.py", line 754, in startRunning
    raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
How can I overcome this reactor error? Thanks.
Answer 0 (score: 1)
I'm not sure what you intend to do in save_info, but here is a minimal example that runs the same spider several times in a row. The error happens because CrawlerProcess.start() runs the Twisted reactor and stops it when the crawl finishes, and a Twisted reactor cannot be restarted, so creating a new CrawlerProcess on every loop iteration fails on the second pass. The example below uses CrawlerRunner instead, so the reactor is started and stopped exactly once. It is based on your class and the example in the documentation:
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


class Run_Spider_From_SubClass:

    def __init__(self, url_list, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_list = url_list
        configure_logging()
        self.runner = CrawlerRunner(get_project_settings())

    @defer.inlineCallbacks
    def crawl(self):
        # each yield waits for the current crawl to finish before the loop
        # moves on to the next url; the reactor keeps running throughout
        for url in self.url_list:
            yield self.runner.crawl('MySpider', url=url)
        reactor.stop()

    def run_spider_in_loop(self):
        self.crawl()
        reactor.run()


urls = ['http://something.com', 'http://another.com']
runner = Run_Spider_From_SubClass(urls)
runner.run_spider_in_loop()
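If save_info is meant to run once per crawl, a natural place for it is inside the loop, right after each yield returns. A minimal sketch of that crawl method (a drop-in replacement for the one in the class above), assuming save_info is your own hypothetical helper that takes the url just crawled:

    @defer.inlineCallbacks
    def crawl(self):
        for url in self.url_list:
            # wait until this crawl has fully finished
            yield self.runner.crawl('MySpider', url=url)
            # save_info is a hypothetical stand-in for your own saving logic;
            # at this point the spider for `url` is done, so its output
            # (e.g. a feed export file) is complete and safe to read
            save_info(url)
        reactor.stop()

Because inlineCallbacks resumes the method only after the yielded crawl's Deferred fires, this gives you the save-then-restart-with-the-next-url behavior from your original loop without ever restarting the reactor.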