I have two spiders for one site, and I want to run multiple spiders in the same process. How can I do that? I did the following, but when I run 'scrapy crawl example' I get an error like twisted.internet.error.ReactorNotRestartable:
import datetime
import json
import socket
from urllib.parse import urljoin

from scrapy import Request, Spider
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader

from crawler_bot.items import ProductDetail  # project-specific item definition


class ExampleMobilePhoneSpider(Spider):
    name = "example"
    allowed_domains = ["www.example.com", "example.com"]
    start_urls = (
        'https://search.example.com/api/search/?category=c11&pageno=0',
    )
    custom_settings = {
        "ITEM_PIPELINES": {
            'crawler_bot.pipelines.ExampleElectronicDevicePipeline': 100,
        }
    }

    def parse_item(self, response):
        js = json.loads(response.body.decode('utf-8'))
        hits = js['hits']['hits']
        for hit in hits:
            l = ItemLoader(item=ProductDetail(), response=response)
            m = hit['_source']
            # print(json.dumps(m, indent=4, sort_keys=True))
            l.add_value('enTitle', m['EnTitle'])
            l.add_value('faTitle', m['FaTitle'])
            l.add_value('minPrice', {"value": m['MinPrice'], "updateDate": datetime.datetime.now()})
            l.add_value('price', {"value": m['MinPriceList'], "updateDate": datetime.datetime.now()})
            l.add_value('maxPrice', {"value": m['MaxPrice'], "updateDate": datetime.datetime.now()})
            l.add_value('isActive', m['IsActive'])
            l.add_value('isEspecialOffer', m['IsSpecialOffer'])
            l.add_value('productCategories', m['ProductCategories'].split())
            l.add_value('imagePath', m['ImagePath'])
            l.add_value('hasVideo', m['HasVideo'])
            l.add_value('productColorList', m['ProductColorList'])
            l.add_value('localID', m['Id'])
            l.add_value('url', response.url)
            l.add_value('project', "example")
            l.add_value('subject', ["electronic_device", "mobile_phone", "mobile"])
            l.add_value('spider', self.name)
            l.add_value('server', socket.gethostname())
            l.add_value('date', datetime.datetime.now())
            l.add_value('collection', "electronic_device")
            # file_path = "https://file.example.com/example/"
            # l.add_value('images', image2base64.get_as_base64(file_path + m['ImagePath']))
            yield l.load_item()

    def parse(self, response):
        base_url_mobile = 'https://search.example.com/api/search/?category=c11&pageno='
        urls = [base_url_mobile + str(n) for n in range(2)]
        for url in urls:
            yield Request(urljoin(response.url, url), callback=self.parse_item)
class ExampleMobileAccessoriesSpider(Spider):
    name = "example_accessories"  # spider names must be unique within a project
    allowed_domains = ["www.example.com", "example.com"]
    start_urls = (
        'https://search.example.com/api/search/?category=c12&pageno=0',
    )
    custom_settings = {
        "ITEM_PIPELINES": {
            'crawler_bot.pipelines.ExampleElectronicDevicePipeline': 110,
        }
    }

    def parse_item(self, response):
        js = json.loads(response.body.decode('utf-8'))
        hits = js['hits']['hits']
        for hit in hits:
            l = ItemLoader(item=ProductDetail(), response=response)
            m = hit['_source']
            # print(json.dumps(m, indent=4, sort_keys=True))
            l.add_value('enTitle', m['EnTitle'])
            l.add_value('faTitle', m['FaTitle'])
            l.add_value('minPrice', {"value": m['MinPrice'], "updateDate": datetime.datetime.now()})
            l.add_value('price', {"value": m['MinPriceList'], "updateDate": datetime.datetime.now()})
            l.add_value('maxPrice', {"value": m['MaxPrice'], "updateDate": datetime.datetime.now()})
            l.add_value('isActive', m['IsActive'])
            l.add_value('isEspecialOffer', m['IsSpecialOffer'])
            l.add_value('productCategories', m['ProductCategories'].split())
            l.add_value('imagePath', m['ImagePath'])
            l.add_value('hasVideo', m['HasVideo'])
            l.add_value('productColorList', m['ProductColorList'])
            l.add_value('localID', m['Id'])
            l.add_value('url', response.url)
            l.add_value('project', "example")
            l.add_value('subject', ["electronic_device", "mobile_phone", "mobile", "accessory", "accessories"])
            l.add_value('spider', self.name)
            l.add_value('server', socket.gethostname())
            l.add_value('date', datetime.datetime.now())
            l.add_value('collection', "electronic_device")
            # file_path = "https://file.example.com/example/"
            # l.add_value('images', image2base64.get_as_base64(file_path + m['ImagePath']))
            yield l.load_item()
    def parse(self, response):
        # paginate the accessories category (c12, matching start_urls)
        base_url_mobile = 'https://search.example.com/api/search/?category=c12&pageno='
        urls = [base_url_mobile + str(n) for n in range(2)]
        for url in reversed(urls):
            yield Request(urljoin(response.url, url), callback=self.parse_item)
SPIDER_LIST = [
    ExampleMobileAccessoriesSpider,
    ExampleMobilePhoneSpider,
]

# Running multiple spiders
for spider in SPIDER_LIST:
    process = CrawlerProcess()
    process.crawl(spider)
    process.start()  # the script will block here until all crawling jobs are finished
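This loop is what raises twisted.internet.error.ReactorNotRestartable: every CrawlerProcess.start() call starts Twisted's reactor, and a reactor cannot be started a second time in the same Python process. The documented fix is to schedule all spiders on a single CrawlerProcess and call start() once, and to run the file directly as a script (e.g. python run_spiders.py, a hypothetical filename) rather than through scrapy crawl, which manages its own reactor. A minimal sketch, reusing SPIDER_LIST from above; get_project_settings is optional and only applies your project settings:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # One process, one reactor: schedule every spider first, then start once.
    process = CrawlerProcess(get_project_settings())
    for spider in SPIDER_LIST:
        process.crawl(spider)
    process.start()  # blocks until all scheduled crawls have finished

With this pattern both spiders run concurrently inside the same reactor. If they must run one after the other, the Scrapy docs show a sequential variant built on CrawlerRunner and chained deferreds; a sketch under the same assumptions:

    from twisted.internet import defer, reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings

    configure_logging()
    runner = CrawlerRunner(get_project_settings())

    @defer.inlineCallbacks
    def crawl():
        # Each yield waits for the previous crawl to finish before starting the next.
        yield runner.crawl(ExampleMobilePhoneSpider)
        yield runner.crawl(ExampleMobileAccessoriesSpider)
        reactor.stop()

    crawl()
    reactor.run()  # blocks until crawl() stops the reactor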