I am building a web scraper with Scrapy. I get the information I want, but only from the first 8 pages; after that it keeps crawling each page without extracting any data.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class InfoSpider(CrawlSpider):
    name = "info"
    start_urls = [
        'http://dounai-lavein.gr/catalog/cat/cars/'
    ]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_css=('div.item-featured',)),
             callback="parse",
             follow=True),)

    def parse(self, response):
        for quote in response.css('div.item-featured'):
            yield {
                'text': quote.css('div.item-title a h3::text').extract_first(),
                'owner': quote.css('div.entry-content p.txtrows-4::text').extract(),
                'address': quote.css('.item-address span.value::text').extract_first(),
                'web_address': quote.css('.item-web span.value a::attr(href)').extract(),
                'image_link': quote.css('.item-image img').xpath("@src").extract_first()[0]
            }

        next_page = response.css('span.nav-next a::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
What can I do to fix this?
Answer 0 (score: 0):
The later pages of the site you are crawling no longer use the item-featured class. Try this:
...

    rules = (
        Rule(LinkExtractor(allow=(), restrict_css=('div.item-container',)),
             callback="parse",
             follow=True),)

    def parse(self, response):
        for quote in response.css('div.item-container'):
            yield {
                ...
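For reference, below is a sketch of the full spider with that single class name swapped in. It keeps the question's original field selectors unchanged, which is an assumption: if the site's markup has changed for the container, the inner classes (item-title, entry-content, item-address, and so on) may also need checking. The only other adjustment is dropping the trailing [0] on extract_first() for the image link, since extract_first() already returns a single string and indexing it would yield only the first character.

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class InfoSpider(CrawlSpider):
    name = "info"
    start_urls = [
        'http://dounai-lavein.gr/catalog/cat/cars/'
    ]

    # Only the container class changed: item-featured -> item-container.
    # The remaining selectors are copied from the question and may need
    # further updates if the site's markup has changed elsewhere.
    rules = (
        Rule(LinkExtractor(allow=(), restrict_css=('div.item-container',)),
             callback="parse",
             follow=True),)

    def parse(self, response):
        for quote in response.css('div.item-container'):
            yield {
                'text': quote.css('div.item-title a h3::text').extract_first(),
                'owner': quote.css('div.entry-content p.txtrows-4::text').extract(),
                'address': quote.css('.item-address span.value::text').extract_first(),
                'web_address': quote.css('.item-web span.value a::attr(href)').extract(),
                # extract_first() returns a string (or None), so no extra [0] here
                'image_link': quote.css('.item-image img').xpath("@src").extract_first()
            }

        next_page = response.css('span.nav-next a::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

You can confirm which class the pages actually use by opening one of the later pages in the Scrapy shell, e.g. scrapy shell 'http://dounai-lavein.gr/catalog/cat/cars/', and checking whether response.css('div.item-featured') or response.css('div.item-container') returns any elements.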