Scrapy crawlspider规则不适用于下一页

时间:2016-05-02 07:07:32

标签: python scrapy

我正在尝试使用 CrawlSpider 抓取下一页链接,但如果我把解析函数改成其他名字,就得不到任何结果——我的规则不起作用。只有使用 parse 函数时,我才能抓取到当前页面。我哪里出错了?

这是我的naukri_spider.py文件

import scrapy

from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from naukri.items import NaukriItem

class NaukriSpider(CrawlSpider):
    """Crawl naukri.com IT job listings, following pagination links.

    BUG FIX: the original subclassed ``Spider``, whose base class never
    processes the ``rules`` attribute -- only ``CrawlSpider`` does, so the
    pagination rule was silently ignored.  Additionally, ``CrawlSpider``
    implements ``parse()`` internally to drive rule-based link following;
    overriding it disables the rules.  The item-extraction callback is
    therefore named ``parse_start_url``, which ``CrawlSpider`` also calls
    for each start URL, so the first page is scraped too.
    """
    name = "naukri"
    allowed_domains = ["naukri.com"]
    start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]

    rules = (
        # Follow the "next page" button inside the pagination bar and run
        # the extraction callback on every page reached this way.
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[@class="pagination"]/a/button[@class="grayBtn"]',)),
             callback="parse_start_url", follow=True),
    )

    def parse_start_url(self, response):
        """Yield one NaukriItem per job block (`class="content"`) on the page."""
        for sel in response.xpath('//*[@class="content"]'):
            item = NaukriItem()
            item['title'] = sel.xpath('span[@class="desig"]/text()').extract()
            item['location'] = sel.xpath('span[@class="loc"]/span/text()').extract()
            item['organization'] = sel.xpath('span[@class="org"]/text()').extract()
            yield item

1 个答案:

答案 0(得分:0)

CrawlSpider 在内部使用 parse 方法来跟进链接,因此不能覆盖它。请把规则的回调改为 parse_start_url,并覆盖该方法来提取数据。

此代码工作正常。

import scrapy

from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from stackoverflow.items import NaukriItem

class NaukriSpider(CrawlSpider):
    """Spider for naukri.com IT job listings.

    The rule follows every in-domain link matching the job-listing URL
    pattern; ``parse_start_url`` runs on the start URL and on each followed
    page, yielding one item per job block.
    """
    name = "naukri"
    allowed_domains = ["naukri.com"]
    start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]

    rules = (
        Rule(
            SgmlLinkExtractor(allow=('information-technology-jobs.*', )),
            callback="parse_start_url",
            follow=True,
        ),
    )

    def parse_start_url(self, response):
        # Field name -> relative XPath inside each job block.
        field_paths = {
            'title': 'span[@class="desig"]/text()',
            'location': 'span[@class="loc"]/span/text()',
            'organization': 'span[@class="org"]/text()',
        }
        # One <... class="content"> node per job posting.
        for node in response.xpath('//*[@class="content"]'):
            job = NaukriItem()
            for field, path in field_paths.items():
                job[field] = node.xpath(path).extract()
            yield job