我需要从每个商品详情页面中提取3个值,但是由于某种原因,蜘蛛似乎没有遵循下一页的分页链接。
我使用 scrapy runspider -s USER_AGENT='Googlebot' myspider.py 运行蜘蛛:
# -*- coding: utf-8 -*-
import scrapy
class AuthorsSpider(scrapy.Spider):
    """Scrape the ad detail pages of one njuskalo.hr agency listing.

    Starting from the agency page in ``start_urls``, ``parse`` schedules a
    request for every ad linked on the listing page and then follows the
    pagination link; ``parse_details`` yields one item per ad.
    """

    name = 'njuskalo'
    # Must be a bare domain: with a trailing slash ('njuskalo.hr/') the
    # offsite filter never matches the request host, so every request
    # yielded from the callbacks is dropped as off-site.
    allowed_domains = ['njuskalo.hr']
    start_urls = ['https://www.njuskalo.hr/agencija/domino-nekretnine']

    def parse(self, response):
        """Yield a request per ad on the listing page, then the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``scrapy.Request`` objects for each ad detail page (handled by
            ``parse_details``) and, if a pagination link is found, for the
            following listing page (handled by this method again).
        """
        urls = response.css('h3.entity-title > a.link::attr(href)').extract()
        for url in urls:
            # hrefs may be relative; resolve them against the page URL.
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_details)

        # Follow the pagination link once per listing page.
        # NOTE(review): the XPath takes the last <li> of the pager and
        # presumably that is the "next" link -- confirm against the live page.
        next_page_url = response.xpath(
            '//*[@id="form_browse_detailed_search"]'
            '/div[2]/div[1]/div[2]/div[1]/nav/ul/li[last()]/a/@href'
        ).extract_first()
        if next_page_url:
            # scrapy.Request rejects URLs without a scheme, so a relative
            # href must be made absolute first (same as the ad links above).
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        """Extract the title, agency code and view count from an ad page.

        Args:
            response: The downloaded ad detail page.

        Yields:
            A dict with keys ``naslov``, ``agencijska_sifra`` and
            ``broj_prikaza``; each value is the list of all matches found
            (possibly empty).
        """
        # The view count is read from inline <script> content rather than
        # from the rendered markup.
        pattern = r'"displayCountText":(\d+),'
        yield {
            'naslov': response.css('h1.entity-title::text').extract(),
            'agencijska_sifra': response.xpath(
                '//th[contains(text(),"Šifra objekta:")]'
                '/following-sibling::td/text()'
            ).extract(),
            'broj_prikaza': response.xpath('//script').re(pattern),
        }