我用 Scrapy 库编写了一个网页抓取程序,但遇到了两个问题:一是无法循环抓取多页数据,二是价格栏抓取到的价格不正确。请问应该如何解决这些问题?这是我的代码:
# -*- coding: utf-8 -*-
import scrapy
class CurtainTestSpider(scrapy.Spider):
    """Crawl the Redbubble shower-curtain listing and scrape each product.

    ``parse`` handles listing pages: it schedules one request per product
    link and then follows the "next page" link, so the crawl walks through
    every listing page. ``parse_item`` handles product pages and yields one
    item (title, price, image URL) per product.
    """

    name = 'curtain_test'
    allowed_domains = ['www.redbubble.com']

    # Listing page the crawl starts from.
    START_URL = 'https://www.redbubble.com/shop/shower-curtains/'

    # Browser-like User-Agent sent with every request issued by this spider.
    REQUEST_HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/81.0.4044.122 Safari/537.36'
        ),
    }

    def start_requests(self):
        """Yield the initial request for the first listing page."""
        yield scrapy.Request(
            url=self.START_URL,
            callback=self.parse,
            headers=self.REQUEST_HEADERS,
        )

    def parse(self, response):
        """Handle a listing page.

        Yields one request per product link (handled by ``parse_item``) and,
        when a "next page" link is found, one request for the next listing
        page (handled by this method again).
        """
        # NOTE(review): the hashed class names in these selectors look
        # build-generated -- confirm they still match the live markup.
        products = response.xpath("//div[@class='styles__grid--197Ps']/a")
        for product in products:
            link = product.xpath(".//@href").get()
            if not link:
                # An anchor without an href cannot be followed.
                continue
            # response.follow resolves relative hrefs against the page URL;
            # scrapy.Request would reject a URL that has no scheme.
            yield response.follow(
                link,
                callback=self.parse_item,
                headers=self.REQUEST_HEADERS,
            )

        # Pagination belongs to the listing page, so it must be followed
        # here rather than from the product-page callback.
        # NOTE(review): index [2] presumably selects the "next" link; verify
        # that it also matches on the first and last listing pages.
        next_page = response.xpath(
            "(//a[@class='Pagination__namedLink--1dOFn'])[2]/@href"
        ).get()
        if next_page:
            yield response.follow(
                next_page,
                callback=self.parse,
                headers=self.REQUEST_HEADERS,
            )

    def parse_item(self, response):
        """Handle a product page and yield its title, price and image URL.

        Each value is the first match of its XPath, or ``None`` when the
        XPath matches nothing.
        """
        title = response.xpath("//h1/text()").get()
        # NOTE(review): nothing in this file explains the reported wrong
        # price; verify this selector against the live product page.
        price = response.xpath(
            "//div[@class='ProductConfiguration__price--31GDW']"
            "/span/span/text()"
        ).get()
        img = response.xpath(
            "(//img[@class='GalleryImage__img--12Vov'])[3]/@src"
        ).get()
        yield {
            'Title': title,
            'price': price,
            'Img_Url': img,
        }