Here is the code that starts the crawler:
def start_crawler(pipe_connection):
    spider = RssSpider()
    crawler = Crawler(Settings())
    crawler.configure()    # Script stops here
    crawler.crawl(spider)  # This line is never reached
    crawler.start()
    log.start()
    reactor.run()          # Script should stop here
It is copied almost directly from this tutorial: http://scrapy.readthedocs.org/en/0.16/topics/practices.html
No exception is raised. The only way I know where the script stops is by inserting print statements as a crude error test.
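For reference, that error testing is nothing more elaborate than prints bracketing each call; it is the same function as above with made-up messages added, roughly like this:

def start_crawler(pipe_connection):
    spider = RssSpider()
    crawler = Crawler(Settings())
    print("before configure()")   # this prints
    crawler.configure()
    print("after configure()")    # this never prints
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()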
Here is the RssSpider definition (rss_spider.py):
import time

import scrapy
from scrapy.contrib.linkextractors import LinkExtractor  # scrapy.linkextractors in Scrapy 1.0+

from crawler.items import RssItem  # assumed import path for the project's item


def callback_filler(cb_item):
    pass


class SimpleSpider(scrapy.Spider):
    link_extractor = LinkExtractor()

    def parse(self, response):
        for link in self.parse_promising_links(response):
            yield scrapy.Request(link, callback=self.parse)

    def parse_promising_links(self, response):
        links = self.link_extractor.extract_links(response)
        urls = [link.url for link in links]
        return urls


class RssSpider(SimpleSpider):
    name = "rss"
    start_urls = [
        "http://www.npr.org/rss/",
        "http://www.cnn.com/services/rss/",
        "http://news.google.com/"
    ]

    def __init__(self, item_callback=callback_filler, *args, **kwargs):
        print("RSS SPIDER INITED")
        super(RssSpider, self).__init__(*args, **kwargs)
        self.item_callback = item_callback

    def parse(self, response):
        print("PARSE: ", response.url)
        if is_rss_atom(response):  # TODO: Change to search each page, get around depth limit
            parsed_response = self.parse_rss_response(response)
            self.item_callback(parsed_response)
            yield parsed_response
            return  # don't follow links from RSS feeds
        for to_yield in super(RssSpider, self).parse(response):
            yield to_yield

    def find_rss_in(self, response):
        # parse_rss_links is a helper defined elsewhere in the project
        for rss_link in parse_rss_links(response):
            yield RssItem({
                'url': rss_link,
                'discovery_date': int(round(time.time() * 1000)),
                'name': None
            })

    def parse_rss_response(self, response):
        item = RssItem()
        item['url'] = response.url
        item['name'] = None
        item['discovery_date'] = int(round(time.time() * 1000))
        return item


def is_rss_atom(response):
    if 'Content-Type' not in response.headers:
        return False
    content_types = response.headers['Content-Type']
    return "application/rss+xml" in content_types or "application/atom+xml" in content_types
Here is the code that calls start_crawler:
from multiprocessing import Pipe, Process
from threading import Thread

from pubsub import pub

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log

from crawler.spiders.rss_spider import RssSpider  # assumed import path


def setup():
    process, pipe_end = start_crawler_process()
    link_pipe_end_to_event(pipe_end, 'new_rss_feed')
    return process


def link_pipe_end_to_event(pipe_end, event_name):
    def thread_func():
        while True:
            item = pipe_end.recv()
            announce_item(item, event_name)
    thread = Thread(target=thread_func)
    thread.start()


def start_crawler_process():
    parent_conn, child_conn = Pipe()
    p = Process(target=start_crawler, args=(child_conn,))
    p.start()
    return p, parent_conn


def start_crawler(pipe_connection):
    spider = RssSpider()
    crawler = Crawler(Settings())
    crawler.configure()  # Script stops here
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()


def announce_item(item, event_name):
    pub.sendMessage(event_name, doc=item)
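On the receiving side, anything interested in the crawled feeds subscribes to the 'new_rss_feed' topic through pubsub. A minimal listener would look roughly like this (the handler name is made up; the doc keyword matches the sendMessage call above):

from pubsub import pub

def on_new_rss_feed(doc):
    # 'doc' carries the item announced by announce_item()
    print("new feed:", doc['url'])

pub.subscribe(on_new_rss_feed, 'new_rss_feed')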
And here is settings.py:
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
CONCURRENT_REQUESTS = 100
LOG_LEVEL = 'DEBUG'
COOKIES_ENABLED = False
RETRY_ENABLED = False
DOWNLOAD_TIMEOUT = 15
DEPTH_LIMIT = 50