我是scrapy和python的新手,我正试图将parse_quotes中的item['author']传递给下一个解析方法parse_bio。
我尝试了如scrapy文档中所示的request.meta和response.meta方法,但是没有成功。参见下面的代码。
感谢您的帮助。
import scrapy
from tutorial.items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Log in, scrape the quote listing, and follow each quote's author link.

    Flow: ``parse`` submits the login form, ``parse_quotes`` yields one item
    per quote and requests the author page, and ``parse_bio`` yields an item
    with the author's details. The author name is handed from
    ``parse_quotes`` to ``parse_bio`` through ``Request.meta``.
    """

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/login',
        #'http://quotes.toscrape.com/page/2',
    ]
    # Scraping a site with login
    # Important: Cookie settings must be "True" to keep the login session alive
    custom_settings = {'COOKIES_ENABLED': True}

    def parse(self, response):
        """Submit the login form found in ``response``.

        The response to the login request is handled by ``parse_quotes``.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.parse_quotes,
        )

    def parse_quotes(self, response):
        """Yield one item per quote, a request per author link, and the next page.

        Each author request carries the quote's author name in
        ``meta['author']`` so that ``parse_bio`` can read it back.
        """
        for sel in response.css('div.quote'):
            item = QuotesItem()
            item['text'] = sel.css('span.text::text').get()
            item['author'] = sel.css('small.author::text').get()
            item['tags'] = sel.css('div.tags a.tag::text').getall()
            item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
            # Store the href string rather than the selector object, so the
            # item holds plain data only.
            bio_href = sel.css('.author + a::attr(href)').get()
            item['author_bio_link'] = bio_href
            yield item

            # follow the detail links @ shortcut
            # vertical crawling
            if bio_href:
                # NOTE: requests are subject to Scrapy's duplicate filter
                # unless dont_filter=True is passed, so an author page linked
                # from several quotes is normally fetched once.
                yield response.follow(
                    bio_href,
                    callback=self.parse_bio,
                    meta={'author': item['author']},
                )

        # follow pagination links @ shortcut
        # horizontal crawling
        # The "next" link is looked up on the listing response handled here.
        for a in response.css('li.next a'):
            yield response.follow(a, callback=self.parse_quotes)

    def parse_bio(self, response):
        """Yield an item with the author details found in ``response``.

        The author name is read from ``response.meta['author']``; it is
        ``None`` when the request was made without that meta key.
        """
        item = QuotesItem()
        item['author'] = response.meta.get('author')
        # Keep only the first two matching text nodes.
        item['author_born'] = response.css('p span::text').getall()[:2]
        # default='' keeps .strip() from failing when nothing matches.
        item['author_bio'] = response.css('div.author-description ::text').get(default='').strip()
        yield item
我希望在parse_bio中获取从parse_quotes传递过来的item['author']。
答案 0 :(得分:0)
我建议您以这种方式使用meta:
def parse_quotes(self, response):
    """Yield one item per quote plus a request for the quote's author link.

    Each author request carries the quote's author name in
    ``meta['author']`` so the ``parse_bio`` callback can read it back from
    ``response.meta``.
    """
    for sel in response.css('div.quote'):
        item = QuotesItem()
        item['text'] = sel.css('span.text::text').get()
        item['author'] = sel.css('small.author::text').get()
        item['tags'] = sel.css('div.tags a.tag::text').getall()
        item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
        # Store the href string rather than the selector object, so the
        # item holds plain data only.
        bio_href = sel.css('.author + a::attr(href)').get()
        item['author_bio_link'] = bio_href
        yield item

        # follow the detail links @ shortcut
        # vertical crawling
        if bio_href:
            yield response.follow(
                bio_href,
                self.parse_bio,
                meta={'author': item['author']},  # <- you set it here
            )
def parse_bio(self, response):
    """Yield an item with the author details found in ``response``.

    The author name is read from ``response.meta['author']``; it is ``None``
    when the request was made without that meta key.
    """
    item = QuotesItem()
    # Keep only the first two matching text nodes.
    item['author_born'] = response.css('p span::text').getall()[:2]
    item['author_data'] = response.meta.get('author')  # <- you get it here
    # default='' keeps .strip() from failing when nothing matches.
    item['author_bio'] = response.css('div.author-description ::text').get(default='').strip()
    yield item

    # follow pagination links @ shortcut
    # horizontal crawling
    # NOTE(review): this looks up the "next" link on the author-page response;
    # presumably only listing pages have 'li.next', so this loop may never
    # fire here. Confirm whether it belongs in parse_quotes.
    for a in response.css('li.next a'):
        yield response.follow(a, callback=self.parse_quotes)