Scrapy stops at `crawler.configure()` instead of `reactor.run()`

Asked: 2015-06-02 01:16:50

Tags: python web-scraping scrapy twisted

Here is the code that starts the crawler:

def start_crawler(pipe_connection):
    spider = RssSpider()
    crawler = Crawler(Settings())
    crawler.configure()  # Script stops here
    crawler.crawl(spider)  # This line is never reached
    crawler.start()
    log.start()
    reactor.run()  # Script should stop here

It is copied almost verbatim from this tutorial: http://scrapy.readthedocs.org/en/0.16/topics/practices.html
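
For context, these are roughly the imports that snippet relies on (module paths as in Scrapy 0.x; rss_spider is just the module where I keep the spider shown below):

from twisted.internet import reactor

from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log

from rss_spider import RssSpider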

No exception is raised. The only way I even know where the script stops is from print statements I added for debugging.
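
Concretely, the probe looks like this (a trimmed sketch of what I did; only the first print ever shows up):

print("before configure")
crawler.configure()
print("after configure")  # never printed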

Here is the RssSpider definition (rss_spider.py):

def callback_filler(cb_item):
    pass


class SimpleSpider(scrapy.Spider):
    link_extractor = LinkExtractor()

    def parse(self, response):
        # follow every promising link and parse it with this same callback
        for link in self.parse_promising_links(response):
            yield scrapy.Request(link, callback=self.parse)

    def parse_promising_links(self, response):
        links = self.link_extractor.extract_links(response)
        urls = [link.url for link in links]
        return urls


class RssSpider(SimpleSpider):
    name = "rss"
    start_urls = [
        "http://www.npr.org/rss/",
        "http://www.cnn.com/services/rss/",
        "http://news.google.com/"
    ]

    def __init__(self, item_callback=callback_filler, *args, **kwargs):
        print("RSS SPIDER INITED")
        super(RssSpider, self).__init__(*args, **kwargs)
        self.item_callback = item_callback

    def parse(self, response):
        print("PARSE: ", response.url)
        if is_rss_atom(response):  # TODO: Change to search each page, get around depth limit
            parsed_response = self.parse_rss_response(response)
            self.item_callback(parsed_response)
            yield parsed_response
            return  # don't follow links from RSS feeds
        for to_yield in super(RssSpider, self).parse(response):
            yield to_yield

    def find_rss_in(self, response):
        for rss_link in parse_rss_links(response):
            yield RssItem({
                'url': rss_link,
                'discovery_date': int(round(time.time() * 1000)),  # epoch millis, as in parse_rss_response
                'name': None
            })

    def parse_rss_response(self, response):
        item = RssItem()
        item['url'] = response.url
        item['name'] = None
        item['discovery_date'] = int(round(time.time() * 1000))
        return item


def is_rss_atom(response):
    if 'Content-Type' not in response.headers:
        return False
    content_types = response.headers['Content-Type']
    return "application/rss+xml" in content_types or "application/atom+xml" in content_types

Here is the code that calls start_crawler:

def setup():
    process, pipe_end = start_crawler_process()
    link_pipe_end_to_event(pipe_end, 'new_rss_feed')
    return process


def link_pipe_end_to_event(pipe_end, event_name):
    def thread_func():
        while True:
            item = pipe_end.recv()
            announce_item(item, event_name)
    thread = Thread(target=thread_func)
    thread.start()


def start_crawler_process():
    parent_conn, child_conn = Pipe()
    p = Process(target=start_crawler, args=(child_conn,))
    p.start()
    return p, parent_conn


def start_crawler(pipe_connection):
    spider = RssSpider()
    crawler = Crawler(Settings())
    crawler.configure()  # Script stops here
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()


def announce_item(item, event_name):
    pub.sendMessage(event_name, doc=item)
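
For context, the consumer side subscribes to that event roughly like this (a sketch; the on_new_rss_feed handler name is made up):

from pubsub import pub


def on_new_rss_feed(doc):
    # 'doc' matches the keyword argument used in pub.sendMessage above
    print("new feed discovered:", doc)


pub.subscribe(on_new_rss_feed, 'new_rss_feed')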

And here is settings.py:

BOT_NAME = 'crawler'

SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

CONCURRENT_REQUESTS = 100
LOG_LEVEL = 'DEBUG'
COOKIES_ENABLED = False
RETRY_ENABLED = False
DOWNLOAD_TIMEOUT = 15
DEPTH_LIMIT = 50

0 answers:

There are no answers yet.