I am using Scrapy, and I am trying to use Celery to manage multiple spiders on one machine. The problem I have (which is a bit hard to explain) is that the spiders get multiplied: if my first spider is running and I start a second one, the first spider executes twice.
Please see my code here:
ProcessJob.py
import MysqlConnector
import CrawlerSettings
import spider

from multiprocessing import Process

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor


class ProcessJob():
    def processJob(self, job):
        # Update the job: mark it as running in the database
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        job.status = 1
        update = "UPDATE job SET status=1 WHERE id=" + str(job.id)
        cur.execute(update)
        db.commit()
        db.close()

        # Start a new crawler for this job
        configure_logging()
        webspider = spider.MySpider
        if job.ajax == 1:
            webspider.custom_settings = CrawlerSettings.ajax_settings
        else:
            webspider.custom_settings = CrawlerSettings.normal_settings
        crawler = UrlCrawlerScript(webspider, job)
        crawler.start()
        crawler.join()
        reactor.stop()


class UrlCrawlerScript(Process):
    def __init__(self, spider, job):
        Process.__init__(self)
        self.crawler = CrawlerRunner()
        self.crawler.crawl(spider, job=job)

    def run(self):
        d = self.crawler.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run(0)  # 0 = do not install signal handlers
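For context, the Celery side boils down to a task that pulls a job from the database and calls processJob on it. This is only a simplified sketch; the task name, broker URL and the load_job helper are placeholders, not my exact code:

    from celery import Celery

    from ProcessJob import ProcessJob

    app = Celery('tasks', broker='redis://localhost:6379/0')  # placeholder broker

    @app.task
    def run_job(job_id):
        job = load_job(job_id)  # placeholder: fetches the job row from MySQL
        ProcessJob().processJob(job)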
The spider:
def __init__(self, job):
    self.job = job

    # Get the hosts
    allowedDomainsPre = job.url.split(",")
    allowedDomains = []
    for domains in allowedDomainsPre:
        parsed_uri = urlparse(domains)
        domain = '{uri.netloc}'.format(uri=parsed_uri)
        print domain
        allowedDomains.append(domain)

    self.allowed_domains = allowedDomains
    self.start_urls = allowedDomainsPre

    # Get the job patterns; patterns containing '-' are denied (with the '-' stripped)
    jobPatterns = job.processing_patterns.split(",")
    allowedPatterns = []
    deniedPatterns = []
    for pattern in jobPatterns:
        if '-' in pattern:
            deniedPatterns.append(pattern.replace("-", ""))
        else:
            allowedPatterns.append(pattern)

    self._rules = [
        Rule(LinkExtractor(allow=allowedPatterns, deny=deniedPatterns),
             callback=self.parse_items, follow=True)
    ]
    self.name = job.id

def parse_items(self, response):
    item = Item()
    item['html'] = response.body
    item['url'] = response.url
    item['job_id'] = self.job.id
    return item
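For completeness, the Item is just a plain Scrapy item with the three fields set in parse_items (sketched here to match the usage above; I left the real definition out):

    import scrapy

    class Item(scrapy.Item):
        html = scrapy.Field()
        url = scrapy.Field()
        job_id = scrapy.Field()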
What this does: I retrieve new jobs from my database (that code is not shown here, it would be a bit too much) and then process them, i.e. run one spider per job. As I said, the problem is that when I execute two jobs at once, the first spider is "doubled" (it runs twice in parallel).
Any suggestions how to solve this? It might be a problem with the reactor again :(
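One direction I have been considering, but have not confirmed under Celery, is to give every crawl its own OS process and its own reactor by using CrawlerProcess, passing the settings per crawl instead of assigning to the class-level custom_settings (sketch only, names simplified):

    from multiprocessing import Process
    from scrapy.crawler import CrawlerProcess

    def _run_isolated(spider_cls, settings, job):
        # Runs inside the child process; CrawlerProcess manages its own reactor.
        process = CrawlerProcess(settings)
        process.crawl(spider_cls, job=job)
        process.start()  # blocks until the crawl finishes

    def start_job(spider_cls, settings, job):
        p = Process(target=_run_isolated, args=(spider_cls, settings, job))
        p.start()
        return p

Would something like this avoid the doubling, or is the problem elsewhere?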