I'm trying to read the index page of a quotes website to retrieve the quote categories. I'm new to this!
I can read a single page (a category) with my code, but I want to read the index page in order to reach the quote pages.
The def parse_item part works for the individual pages, but I can't get the LinkExtractor part to follow the links.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ['website.com']
    start_urls = [
        'https://www.website.com/topics'
    ]

    rules = (
        Rule(LinkExtractor(allow=('^\/topics.*', )), callback='parse_item')
    )

    def parse_item(self, response):
        for quote in response.css('#quotesList .grid-item'):
            yield {
                'text': quote.css('a.oncl_q::text').extract_first(),
                'author': quote.css('a.oncl_a::text').extract_first(),
                'tags': quote.css('.kw-box a.oncl_list_kc::text').extract(),
                'category': response.css('title::text').re(r'(\w+).*')
            }

        next_page = response.css('div.bq_s.hideInfScroll > nav > ul > li:nth-last-child(1) a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
Answer 0 (score: 0)

Here is your error:

yield scrapy.Request(next_page, callback=self.parse)

Where is your parse method? It is never defined, so the callback points at nothing. Change it like this (note that follow is a method of the response object, not of the scrapy module):

yield response.follow(url=next_page, callback=self.parse_item)
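For context, response.follow resolves a relative URL against the current response, so the separate response.urljoin() step from the question also becomes unnecessary. A minimal sketch of the corrected pagination tail, reusing the selector from the question:

    # Follow the pagination link. Unlike scrapy.Request, which needs an
    # absolute URL, response.follow accepts the relative href as-is and
    # resolves it against the current page before scheduling the request.
    next_page = response.css('div.bq_s.hideInfScroll > nav > ul > li:nth-last-child(1) a::attr(href)').extract_first()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse_item)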
Answer 1 (score: 0)
I've solved the problem myself. Although it could have been done with Rule(LinkExtractor(...)) (see the note after the code for why my original attempt never fired), I instead used a cascade of response.css queries to follow the links from the topics page.
Here is the final, working version...
import scrapy

class QuotesBrainy(scrapy.Spider):
    name = 'Quotes'
    start_urls = ['https://www.website.com/topics/']

    def parse(self, response):
        # follow links to topic pages
        for href in response.css('a.topicIndexChicklet::attr(href)'):
            yield response.follow(href, self.parse_item)

    def parse_item(self, response):
        # iterate through all quotes
        for quote in response.css('#quotesList .grid-item'):
            yield {
                'text': quote.css('a.oncl_q::text').extract_first(),
                'author': quote.css('a.oncl_a::text').extract_first(),
                'tags': quote.css('.kw-box a.oncl_list_kc::text').extract(),
                'category': response.css('title::text').re(r'(\w+).*')
            }

        # go through the pagination links to access infinite scroll
        next_page = response.css('div.bq_s.hideInfScroll > nav > ul > li:nth-last-child(1) a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse_item)
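A note on why the Rule(LinkExtractor(...)) approach in the question never followed any links: the rules attribute is only processed by scrapy.spiders.CrawlSpider, while the question subclasses plain scrapy.Spider, which ignores rules entirely. A minimal sketch of that route, assuming the topic links still match the original allow pattern (the spider name and tightened regex here are illustrative):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class QuotesCrawlSpider(CrawlSpider):
    name = 'quotes_crawl'
    allowed_domains = ['website.com']
    start_urls = ['https://www.website.com/topics']

    # rules must be an iterable of Rule objects; the trailing comma makes
    # this a one-element tuple. Only CrawlSpider applies these rules.
    rules = (
        Rule(LinkExtractor(allow=(r'/topics/.+',)), callback='parse_item'),
    )

    # CrawlSpider uses parse() internally, so the rule callback must have
    # a different name; overriding parse() would break link following.
    def parse_item(self, response):
        for quote in response.css('#quotesList .grid-item'):
            yield {
                'text': quote.css('a.oncl_q::text').extract_first(),
                'author': quote.css('a.oncl_a::text').extract_first(),
                'tags': quote.css('.kw-box a.oncl_list_kc::text').extract(),
                'category': response.css('title::text').re(r'(\w+).*')
            }

Two details trip people up with this pattern: the question's rules assignment lacks the trailing comma, so it is a bare Rule rather than a tuple, and defining the callback as parse would silently disable the rules.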