Question

我用 Scrapy 库编写了一个网页爬虫，但在循环抓取多页数据时遇到了问题，而且价格字段抓到的值也不正确。请问该如何解决这些问题？这是我的代码：

# -*- coding: utf-8 -*-
import scrapy


class CurtainTestSpider(scrapy.Spider):
    """Crawl the Redbubble shower-curtain listing and scrape each product.

    Flow: ``start_requests`` fetches the first listing page, ``parse``
    schedules one request per product link and then follows the listing's
    "next page" link, and ``parse_item`` extracts the fields of a single
    product page.
    """

    name = 'curtain_test'
    allowed_domains = ['www.redbubble.com']

    # Browser-like User-Agent sent with every request; defined once so the
    # three request sites cannot drift apart.
    _HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/81.0.4044.122 Safari/537.36'
        ),
    }

    def start_requests(self):
        """Yield the request for the first listing page."""
        yield scrapy.Request(
            url='https://www.redbubble.com/shop/shower-curtains/',
            callback=self.parse,
            headers=self._HEADERS,
        )

    def parse(self, response):
        """Handle a listing page.

        Yields one request per product link (handled by ``parse_item``) and,
        when a "next page" link is present, a request for the following
        listing page (handled by this method again).
        """
        products = response.xpath("//div[@class='styles__grid--197Ps']/a")
        for product in products:
            link = product.xpath(".//@href").get()
            if not link:
                # An anchor without an href cannot be followed.
                continue
            # urljoin makes relative hrefs absolute; scrapy.Request rejects
            # URLs that lack a scheme.
            yield scrapy.Request(
                url=response.urljoin(link),
                callback=self.parse_item,
                headers=self._HEADERS,
            )

        # Pagination belongs to the listing page, so it is followed here
        # rather than from the product-detail callback.
        # NOTE(review): the [2] index assumes the "next" link is the second
        # named pagination link on every page -- confirm against page 1.
        next_page = response.xpath(
            "(//a[@class='Pagination__namedLink--1dOFn'])[2]/@href"
        ).get()
        if next_page:
            yield scrapy.Request(
                url=response.urljoin(next_page),
                callback=self.parse,
                headers=self._HEADERS,
            )

    def parse_item(self, response):
        """Extract title, price and image URL from a product page.

        Yields a single dict with the keys ``Title``, ``price`` and
        ``Img_Url``; a value is ``None`` when its XPath matches nothing.
        """
        title = response.xpath("//h1/text()").get()
        # NOTE(review): only the first matching text node is taken; if the
        # price is split across several nodes this returns a fragment --
        # verify against the live markup.
        price = response.xpath(
            "//div[@class='ProductConfiguration__price--31GDW']/span/span/text()"
        ).get()
        img = response.xpath(
            "(//img[@class='GalleryImage__img--12Vov'])[3]/@src"
        ).get()
        yield {
            'Title': title,
            'price': price,
            'Img_Url': img,
        }

使用Scrapy Web Scraping循环抓取多个页面时出现问题

0 个答案: