如果我在方法中多次调用此代码,它会失败,但终端中不会显示任何错误——它只运行一次。是否可以用同一只蜘蛛重新爬取两次?第二次调用时它在 reactor.run() 这一行失败,蜘蛛再也不会运行,而日志中没有任何错误。
def crawlSite(self):
    """Crawl self.website with a fresh MySpider, blocking until the crawl ends.

    Twisted's reactor can only be started once per process: the first crawl
    stops it via the ``spider_closed`` signal, and any later ``reactor.run()``
    returns immediately and silently — which is exactly why a second call to
    this method appeared to do nothing, with no error logged. Running each
    crawl in its own child process gives every call a brand-new reactor, so
    this method can safely be invoked any number of times.
    """
    import multiprocessing  # local import so the module's imports stay unchanged

    proc = multiprocessing.Process(target=self._runCrawlOnce)
    proc.start()
    proc.join()  # block until the spider finishes, mirroring the old reactor.run()

def _runCrawlOnce(self):
    """Run a single crawl in the current (child) process.

    Sets up the spider, wires the ``spider_closed`` signal to stop the
    reactor, starts logging, and blocks in ``reactor.run()`` until the
    crawl completes.
    """
    self.mySpider = MySpider()
    self.mySpider.setCrawlFolder(self.website)

    settings = get_project_settings()
    settings.set('DEPTH_LIMIT', self.depth)

    crawler = Crawler(settings)
    # Stop the reactor once the spider closes so reactor.run() returns.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.mySpider)
    crawler.start()

    log.start(logfile="results.log", loglevel=log.ERROR, crawler=crawler, logstdout=False)  # log.DEBUG
    reactor.run()  # blocks here until the spider_closed signal is sent
下面是 MySpider 类:
class MySpider(CrawlSpider):
    """Crawl mysite.ca, following internal links and extracting page data."""

    name = "mysite"
    # Output folders for crawl results; crawlFolder is populated externally
    # via setCrawlFolder() — presumably the other two are set elsewhere
    # (not visible here; confirm against the rest of the file).
    crawlFolder = ""
    crawlFolder1 = ""
    crawlFolder2 = ""
    allowed_domains = ["mysite.ca"]
    start_urls = [ "http://www.mysite.ca" ]
    # Follow every unique link under http://www.mysite.ca/ and hand each
    # response to parse_item. NOTE(review): SgmlLinkExtractor was removed in
    # modern Scrapy — LinkExtractor is the replacement; confirm the Scrapy
    # version in use before upgrading.
    rules = [ Rule(SgmlLinkExtractor(allow=(r'^http://www.mysite.ca/',), unique=True), callback='parse_item', follow=True), ]

    def parse_item(self, response):
        # Store data in a website item object.
        item = WebsiteClass()
        # .extract() returns a list of matching strings, not a single value.
        item['title'] = response.selector.xpath('//title/text()').extract()
        item['body'] = response.selector.xpath('//body').extract()
        item['url'] = response.url
        ...
然后我有一个 SetupClass,它调用 CrawlerClass 中的 crawlSite():
self.crawlerClass.crawlSite()