如何在我的解析中调用下一页 - Scrapy

时间:2016-05-02 10:31:57

标签: python-2.7 scrapy scrapy-spider

我已经尝试了所有的东西,但我似乎无法弄清楚如何调用parse_category中的下一页。

当我直接进入一个category页面时,我尝试过LinkExtractor,但这不起作用。

import scrapy.selector 
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from msh_final.items import CrawlerMshFinalItem


def complete_url(string):
    """Turn a site-relative href into an absolute mediamarkt.be URL."""
    base = "http://www.mediamarkt.be"
    return "%s%s" % (base, string)


def get_base_url(url):
    """Return the ``scheme://netloc`` prefix of *url*.

    :param url: an absolute URL string; an empty/falsy value yields "".
    :returns: e.g. "http://www.mediamarkt.be" for any page on that host,
        or "" when *url* is empty.
    """
    # Guard clause + truthiness test instead of the inverted `!= ""` check;
    # also covers None, which the original would have passed to urlparse.
    if not url:
        return ""
    u = urlparse.urlparse(url)
    return "%s://%s" % (u.scheme, u.netloc)


def encode(text):
    """UTF-8-encode *text*, silently dropping unencodable characters.

    The parameter was renamed from ``str`` so it no longer shadows the
    builtin; all call sites in this file pass it positionally.
    """
    return text.encode('utf8', 'ignore')


class msh_finalSpider(CrawlSpider):
    """Scrape product name/price from mediamarkt.be category listings.

    NOTE(review): this subclasses CrawlSpider but overrides ``parse()``,
    which CrawlSpider itself uses to drive its Rule-based crawling — that
    is presumably why rule/LinkExtractor-driven pagination never fires;
    confirm against the Scrapy CrawlSpider documentation.
    """
    name = 'msh_final'
    # Entry point: the "Telefoon-Navigatie" category listing page.
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']

    def parse(self, response):
        """Follow every category link found on the start page."""
        items = response.xpath('//ul[@class="infield cf"]//div[@class="infield-wrapper"]/h2/a/@href')
        for item in items:
            # hrefs are site-relative; complete_url() prefixes the domain.
            link = item.extract()
            yield Request(complete_url(link), callback=self.parse_category)

    def parse_category(self, response):
        """Yield one item per product block on a category page."""
        items = response.xpath("//ul[@class='products-list']/li/div")
        for item in items:
            msh = CrawlerMshFinalItem()
            # NOTE(review): extract()[0] raises IndexError if the node is
            # missing for a product — extract_first() would be safer.
            msh['item_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
            msh['item_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            yield msh

1 个答案:

答案 0(得分:0):

您应该从Spider而不是CrawlSpider继承您的蜘蛛并使用以下代码:

class msh_finalSpider(Spider):
    """Scrape product name/price from mediamarkt.be category listings.

    Inherits from plain Spider (not CrawlSpider) so that overriding
    ``parse()`` is safe, and follows the "next page" link manually from
    ``parse_category``.
    """
    name = 'msh_final'
    # Entry point: the "Telefoon-Navigatie" category listing page.
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']

    def parse(self, response):
        """Follow every category link found on the start page."""
        items = response.xpath('//ul[@class="infield cf"]//div[@class="infield-wrapper"]/h2/a/@href')
        for item in items:
            # hrefs are site-relative; complete_url() prefixes the domain.
            link = item.extract()
            yield Request(complete_url(link), callback=self.parse_category)

    def parse_category(self, response):
        """Yield one item per product, then follow the next-page link."""
        items = response.xpath("//ul[@class='products-list']/li/div")
        for item in items:
            msh = CrawlerMshFinalItem()
            msh['item_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
            msh['item_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            yield msh

        # Bug fix: on the last page there is no "pagination-next" link, so
        # the original extract()[0] raised IndexError and aborted the
        # callback. Guard the lookup and only recurse while a link exists.
        next_links = response.xpath('//li[@class="pagination-next"]/a/@href').extract()
        if next_links:
            yield Request(
                complete_url(next_links[0]),
                callback=self.parse_category
            )