I am writing a scraper with the following two functions, which sit at the bottom of the crawl process.
# Imports used by these spider methods (the enclosing spider class is omitted):
from urlparse import urljoin

from bs4 import BeautifulSoup
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

def parse_summary(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    soup = BeautifulSoup(hxs.select("//div[@class='PrimaryContent']").extract()[0])
    text = soup.get_text()
    item['main_summary'] = text
    summary_links = hxs.select("//ul[@class='module_leftnav']/li/a/@href").extract()
    chap_summary_links = [urljoin(response.url, link) for link in summary_links]
    for link in chap_summary_links:
        print 'yielding request to chapter summary.'
        yield Request(link, callback=self.parse_chap_summary_link, meta={'item': item})

def parse_chap_summary_link(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    item['chapter_summaries'] = hxs.select("//h1/text()").extract()
    soup = BeautifulSoup(hxs.select("//div[@class='PrimaryContent']").extract()[0])
    text = soup.get_text()
    item['chapter_summaries'] += [text]
    yield item
At the bottom of parse_summary I yield requests to parse_chap_summary_link so that it can extract data from the chapter summary pages. This works, but the problem is that the output gives me:
{item 1, [chapter 1 summary]}
{item 1, [chapter 2 summary]}
But I want:
{item 1, [Chapter 1 summary, Chapter 2 Summary]}
{item 2, [Chapter 1 summary, Chapter 2 Summary, Chapter 3 etc etc]}
How do I roll all of the chapter summary information into one item, rather than creating a new item for each chapter summary?
Answer 0 (score: 0)
One option is to perform the requests one by one, with each callback dispatching the next request. For example:
def parse_summary(self, response):
    # ...
    links = [urljoin(response.url, link) for link in summary_links]
    return self._dispatch_summary_request(item, links)

def parse_chap_summary_link(self, response):
    item = response.meta['item']
    # ... collect the summary into the item field
    return self._dispatch_summary_request(item, response.meta['summary_links'])

def _dispatch_summary_request(self, item, links):
    try:
        next_link = links.pop()
    except IndexError:
        # no links left
        return item
    else:
        # TODO: a request might fail; to avoid losing the item, the request
        # should have an errback that handles the failure and resumes the
        # next summary request (see the sketch below).
        return Request(next_link, meta={'item': item, 'summary_links': links},
                       callback=self.parse_chap_summary_link)
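One way to address that TODO is to give each Request an errback, so a failed chapter page is skipped instead of silently dropping the item. A minimal sketch, assuming the recovery policy is simply to continue with the remaining links; the errback name is my own, and failure.request is how Scrapy's errback examples recover the original request and its meta:

def _dispatch_summary_request(self, item, links):
    try:
        next_link = links.pop()
    except IndexError:
        return item  # no links left; emit the finished item
    else:
        return Request(next_link, meta={'item': item, 'summary_links': links},
                       callback=self.parse_chap_summary_link,
                       errback=self.summary_request_failed)

def summary_request_failed(self, failure):
    # Skip the page that failed and resume the chain with the remaining
    # links, so the item is still returned once the chain finishes.
    request = failure.request
    return self._dispatch_summary_request(request.meta['item'],
                                          request.meta['summary_links'])

Because the requests are chained one at a time, this trades Scrapy's usual concurrency for the guarantee that the item is returned exactly once, after all summaries have been collected.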
Another option is to use the inline_requests decorator:
@inline_requests
def parse_summary(self, response):
    # ...
    for link in chap_summary_links:
        try:
            response = yield Request(link)
        except Exception:
            # TODO: handle the error, log or something
            pass
        else:
            # extract the summary as in parse_chap_summary_link ...
            item['chapter_summaries'] += [text]
    # Must use yield at the end, as this callback is a generator
    # due to the previous yield statements.
    yield item
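Note that inline_requests comes from the third-party scrapy-inline-requests package (pip install scrapy-inline-requests), not from Scrapy itself. Filling in the parts the answer elides, here is a sketch of the complete callback under this approach, reusing the selectors from the question; skipping a failed page with continue is my own assumption about the desired error handling:

from urlparse import urljoin

from bs4 import BeautifulSoup
from inline_requests import inline_requests
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

@inline_requests
def parse_summary(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    soup = BeautifulSoup(hxs.select("//div[@class='PrimaryContent']").extract()[0])
    item['main_summary'] = soup.get_text()
    item['chapter_summaries'] = []
    summary_links = hxs.select("//ul[@class='module_leftnav']/li/a/@href").extract()
    for link in [urljoin(response.url, l) for l in summary_links]:
        try:
            chap_response = yield Request(link)
        except Exception:
            continue  # assumption: skip a failed chapter page; the item is still yielded
        chap_hxs = HtmlXPathSelector(chap_response)
        soup = BeautifulSoup(chap_hxs.select("//div[@class='PrimaryContent']").extract()[0])
        # accumulate every chapter heading and summary on the one item
        item['chapter_summaries'] += chap_hxs.select("//h1/text()").extract() + [soup.get_text()]
    yield item

Since the decorated method collects everything before its final yield, the spider emits {item 1, [Chapter 1 summary, Chapter 2 summary, ...]} as a single item, which is the output the question asks for.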