Question

我正在尝试使用 Scrapy 抓取一个网站。只抓取某个特定页面时，分页可以正常工作；但当我想通过依次跟进“下一页”链接来一次性抓取所有页面时，分页就不起作用了。
我尝试过为分页单独编写一个函数，但这并不能解决问题。我究竟做错了什么？非常感谢任何帮助。这是我的代码：

# -*- coding: utf-8 -*-
import scrapy

from scrapy.loader.processors import MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.http import Request

from avtogumi.items import AvtogumiItem


class BasicSpider(scrapy.Spider):
    """Crawl the brand listings linked from the start page, page by page.

    ``parse`` requests every brand link found on the start page;
    ``parse_params`` loads an item from each listing page and then requests
    the page behind the "next" link, if there is one.
    """

    name = 'gumi'
    allowed_domains = ['avtogumi.bg']
    start_urls = ['https://bg.avtogumi.bg/oscommerce/index.php']

    def parse(self, response):
        """Yield one request per brand link, handled by ``parse_params``."""
        for href in response.xpath('//div[@class="brands"]//a/@href').extract():
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_params,
            )

    def parse_params(self, response):
        """Yield the item loaded from this page, then the next-page request."""
        # NOTE(review): the loader is bound to the whole response, so every
        # add_xpath call collects all matches on the page into a single item.
        # Emitting one item per product would need a per-product container
        # selector, which is not visible from this file.
        loader = ItemLoader(item=AvtogumiItem(), response=response)

        loader.add_xpath('title', '//h4/a/text()')
        loader.add_xpath('subtitle', '//p[@class="ft-darkgray"]/text()')
        loader.add_xpath(
            'price',
            '//span[@class="promo-price"]/text()',
            MapCompose(str.strip, str.title),
        )
        loader.add_xpath('stock', '//div[@class="product-box-stock"]//span/text()')
        # XPath positions are 1-based, so the former "[0]" predicate could
        # never match and left these fields empty. "(...)[1]" selects the
        # first matching element in the whole document.
        loader.add_xpath(
            'category',
            '(//div[@class="labels hidden-md hidden-lg"])[1]//text()',
        )
        loader.add_xpath(
            'brand',
            '(//h4[@class="brand-header"])[1]//text()',
            MapCompose(str.strip, str.title),
        )
        loader.add_xpath('img_path', '//div/img[@class="prod-imglist"]/@src')

        yield loader.load_item()

        # Keep following the "next" link until a page no longer has one.
        next_page_url = response.xpath('//li/a[@class="next"]/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(
                url=response.urljoin(next_page_url),
                callback=self.parse_params,
            )

Answer 1

使用/重写此代码

SIGSEGV

共抓取到 13407 件商品

Answer 2

问题出在下面这段代码：

# Excerpt of the body of parse_params as posted in the question.
# A single loader is created for the entire response, so the absolute "//"
# XPaths below gather every match on the page into one item.
l = ItemLoader(item=AvtogumiItem(), response=response)

l.add_xpath('title', '//h4/a/text()')
l.add_xpath('subtitle', '//p[@class="ft-darkgray"]/text()')
l.add_xpath('price', '//span[@class="promo-price"]/text()',
    MapCompose(str.strip, str.title))
l.add_xpath('stock', '//div[@class="product-box-stock"]//span/text()')
# NOTE(review): XPath positions are 1-based, so the "[0]" predicates on the
# next two paths never match anything.
l.add_xpath('category', '//div[@class="labels hidden-md hidden-lg"][0]//text()')
l.add_xpath('brand', '//h4[@class="brand-header"][0]//text()', 
    MapCompose(str.strip, str.title))
l.add_xpath('img_path', '//div/img[@class="prod-imglist"]/@src')

# Exactly one item is yielded per response.
yield l.load_item()

这段代码对整个页面只会解析并加载一个结果。如果页面上有多个结果，就必须把这段代码放进 for 循环，逐个遍历需要解析的搜索结果：

objects = response.xpath('my_selector_here')
for object in objects:
    l = ItemLoader(item=AvtogumiItem(), response=response)

    l.add_xpath('title', '//h4/a/text()')
    l.add_xpath('subtitle', '//p[@class="ft-darkgray"]/text()')
    l.add_xpath('price', '//span[@class="promo-price"]/text()',
        MapCompose(str.strip, str.title))
    l.add_xpath('stock', '//div[@class="product-box-stock"]//span/text()')
    l.add_xpath('category', '//div[@class="labels hidden-md hidden-lg"][0]//text()')
    l.add_xpath('brand', '//h4[@class="brand-header"][0]//text()', 
        MapCompose(str.strip, str.title))
    l.add_xpath('img_path', '//div/img[@class="prod-imglist"]/@src')

    yield l.load_item()

希望这会有所帮助

多重分页失败导致分页失败

2 个答案: