Scrapy twisted.internet.error.ReactorNotRestartable error

Posted: 2017-05-21 14:41:20

Tags: python scrapy web-crawler

I have two spiders for one site and I want to run multiple spiders in the same process. How can I do that? I did the following, but when I run 'scrapy crawl example' I get an error like twisted.internet.error.ReactorNotRestartable:

import datetime
import json
import socket
from urllib.parse import urljoin

from scrapy import Request, Spider
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader

# project-specific item definition; the exact module path is assumed
from crawler_bot.items import ProductDetail


class ExampleMobilePhoneSpider(Spider):
    name = "example"
    allowed_domains = ["www.example.com", "example.com"]
    start_urls = (
        'https://search.example.com/api/search/?category=c11&pageno=0',
    )

    custom_settings = {
        "ITEM_PIPELINES": {
            'crawler_bot.pipelines.ExampleElectronicDevicePipeline': 100,
        }
    }

    def parse_item(self, response):
        js = json.loads(response.body.decode('utf-8'))
        hits = js['hits']['hits']
        for counter, hit in enumerate(hits):
            l = ItemLoader(item=ProductDetail(), response=response)
            m = hits[counter]['_source']

            # print(json.dumps(m, indent=4, sort_keys=True))
            l.add_value('enTitle', m['EnTitle'])
            l.add_value('faTitle', m['FaTitle'])
            l.add_value('minPrice', {"value": m['MinPrice'], "updateDate": datetime.datetime.now()})
            l.add_value('price', {"value": m['MinPriceList'], "updateDate": datetime.datetime.now()})
            l.add_value('maxPrice', {"value": m['MaxPrice'], "updateDate": datetime.datetime.now()})
            l.add_value('isActive', m['IsActive'])
            l.add_value('isEspecialOffer', m['IsSpecialOffer'])
            l.add_value('productCategories', m['ProductCategories'].split())
            l.add_value('imagePath', m['ImagePath'])
            l.add_value('hasVideo', m['HasVideo'])
            l.add_value('productColorList', m['ProductColorList'])
            l.add_value('localID', m['Id'])

            l.add_value('url', response.url)
            l.add_value('project', "example")
            l.add_value('subject', ["electronic_device", "mobile_phone", "mobile"])
            l.add_value('spider', self.name)
            l.add_value('server', socket.gethostname())
            l.add_value('date', datetime.datetime.now())
            l.add_value('collection', "electronic_device")

            # file_path = "https://file.example.com/example/"
            # l.add_value('images', image2base64.get_as_base64(file_path + m['ImagePath']))

            yield l.load_item()

    def parse(self, response):
        base_url_mobile = 'https://search.example.com/api/search/?category=c11&pageno='
        urls = [base_url_mobile + str(n) for n in range(2)]
        for url in urls:
            yield Request(urljoin(response.url, url), callback=self.parse_item)


class ExampleMobileAccessoriesSpider(Spider):
    name = "example"
    allowed_domains = ["www.example.com", "example.com"]
    start_urls = (
        'https://search.example.com/api/search/?category=c12&pageno=0',
    )

    custom_settings = {
        "ITEM_PIPELINES": {
            'crawler_bot.pipelines.ExampleElectronicDevicePipeline': 110,
        }
    }

    def parse_item(self, response):
        js = json.loads(response.body.decode('utf-8'))
        hits = js['hits']['hits']
        for counter, hit in enumerate(hits):
            l = ItemLoader(item=ProductDetail(), response=response)
            m = hits[counter]['_source']

            # print(json.dumps(m, indent=4, sort_keys=True))
            l.add_value('enTitle', m['EnTitle'])
            l.add_value('faTitle', m['FaTitle'])
            l.add_value('minPrice', {"value": m['MinPrice'], "updateDate": datetime.datetime.now()})
            l.add_value('price', {"value": m['MinPriceList'], "updateDate": datetime.datetime.now()})
            l.add_value('maxPrice', {"value": m['MaxPrice'], "updateDate": datetime.datetime.now()})
            l.add_value('isActive', m['IsActive'])
            l.add_value('isEspecialOffer', m['IsSpecialOffer'])
            l.add_value('productCategories', m['ProductCategories'].split())
            l.add_value('imagePath', m['ImagePath'])
            l.add_value('hasVideo', m['HasVideo'])
            l.add_value('productColorList', m['ProductColorList'])
            l.add_value('localID', m['Id'])

            l.add_value('url', response.url)
            l.add_value('project', "example")
            l.add_value('subject', ["electronic_device", "mobile_phone", "mobile", "accessory", "accessories"])
            l.add_value('spider', self.name)
            l.add_value('server', socket.gethostname())
            l.add_value('date', datetime.datetime.now())
            l.add_value('collection', "electronic_device")

            # file_path = "https://file.example.com/example/"
            # l.add_value('images', image2base64.get_as_base64(file_path + m['ImagePath']))

            yield l.load_item()

    def parse(self, response):
        base_url_mobile = 'https://search.example.com/api/search/?category=c11&pageno='  # note: c11 here, although this spider's start_urls use c12
        urls = [base_url_mobile + str(n) for n in range(2)]
        for url in reversed(urls):
            yield Request(urljoin(response.url, url), callback=self.parse_item)
        return 0


SPIDER_LIST = [
    ExampleMobileAccessoriesSpider,
    ExampleMobilePhoneSpider,
]
# Running Multi-Spider
for spider in SPIDER_LIST:
    process = CrawlerProcess()
    process.crawl(spider)
    process.start()

    # the script will block here until all crawling jobs are finished
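
The loop above is what raises the error: each iteration calls process.start(), and Twisted's reactor cannot be restarted once it has stopped within the same process. The pattern in Scrapy's documentation for running several spiders in one process is to register every spider on a single CrawlerProcess and call start() exactly once, as in this minimal sketch (note that both spiders above share name = "example", which would also need to be made unique):

process = CrawlerProcess()
for spider in SPIDER_LIST:
    process.crawl(spider)  # only schedules the crawl; nothing runs yet
process.start()  # starts the reactor once and blocks until all spiders finish

If the spiders must run one after another instead of concurrently, the documented alternative is CrawlerRunner with a chained deferred; configure_logging() is needed because CrawlerRunner, unlike CrawlerProcess, does not set up logging by itself:

from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl_sequentially():
    for spider in SPIDER_LIST:
        yield runner.crawl(spider)  # waits for each crawl to finish
    reactor.stop()

crawl_sequentially()
reactor.run()  # blocks until reactor.stop() above is called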

0 Answers:

No answers.