我正在尝试了解Scrapy的基础知识,并且正在阅读一些教程。 我正在尝试从意大利房屋网站https://www.immobiliare.it抓取数据。我想访问首页(起始网址),然后从那里抓取页面以下的网站
hompage->省/市->列表->房屋公告
我想出了这段代码,但是它不起作用,我也不明白为什么。如果我确实对单个代码段进行调试,则它们可以正常工作,但总体而言,蜘蛛程序不会产生任何结果,因为它会停止在主页上。有人可以提示我错误在哪里吗?谢谢。
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from immobiliare.items import ImmobiliareItem
class ScrimmoSpider(CrawlSpider):
name = 'scrimmo'
allowed_domains = ['www.immobiliare.it']
start_urls = ["https://www.immobiliare.it/"]
rules = (
Rule(LinkExtractor(allow=(), restrict_css=('#listing-pagination > ul.pull-right.pagination > li:nth-child(1) > a',)),
callback="parse_item",
follow=True),
)
def parse_homepage(self, response):
loc_links = response.css('.home-city-search__city--residenziale > a::attr(href)').extract()
for loc in loc_links:
yield scrapy.Request(loc, callback=self.parse_comuni)
self.log('\tProcessing location URL: ' + response.url)
def parse_comuni(self, response):
comuni_links = response.css('.comuni__comune-name::attr(href)').extract()
for comune in comuni_links:
yield scrapy.Request(comune, callback=self.parse_item)
print('\nProcessing comune URL: ' + response.url)
def parse_item(self, response):
item_links = response.css('.text-primary >a::attr(href)').extract()
for a in item_links:
yield scrapy.Request(a, callback=self.parse_detail_page)
print('\nProcessing... ' + response.url)
next_page_url = response.css('#listing-pagination > ul.pull-right.pagination > li:nth-child(1) > a')
next_page_url = response.urljoin(next_page_url)
def parse_detail_page(self, response):
localita = response.css('h1::text').extract()[0].split(",")[-1].strip()
prezzo = response.css('.features__price > span::text').extract()[0].strip("€").strip().replace(".","")
numero_locali = response.css('.features__list > li > div > span ::text').extract()[0].split(",")[0].replace(u'\xa0', u' ').strip()
superficie = response.css('.features__list > li > div > span ::text').extract()[1]
item = ImmobiliareItem()
item['localita'] = localita
item['prezzo'] = prezzo
item['numero_locali'] = numero_locali
item['superficie'] = superficie
yield item