我正在尝试使用crawlspider抓取下一页链接,但如果我将解析函数更改为其他内容,则无法获得任何结果。我的规则不起作用。我只能用解析函数获取当前页面。我哪里出错了。
这是我的naukri_spider.py文件
import scrapy
from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from naukri.items import NaukriItem
class NaukriSpider(Spider):
name = "naukri"
allowed_domains = ["naukri.com"]
start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[@class="pagination"]/a/button[@class="grayBtn"]',)), callback="parse", follow= True),
)
def parse(self,response):
for sel in response.xpath('//*[@class="content"]'):
item = NaukriItem()
item['title'] = sel.xpath('span[@class="desig"]/text()').extract()
item['location'] = sel.xpath('span[@class="loc"]/span/text()').extract()
item['organization'] = sel.xpath('span[@class="org"]/text()').extract()
yield item
答案 0 :(得分:0)
parse
实施的CrawlSpider
方法用于关注链接。将规则回调更改为parse_start_url
并覆盖它。
此代码工作正常。
import scrapy
from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from stackoverflow.items import NaukriItem
class NaukriSpider(CrawlSpider):
name = "naukri"
allowed_domains = ["naukri.com"]
start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]
rules = (
Rule(SgmlLinkExtractor(allow=('information-technology-jobs.*', )), callback="parse_start_url", follow= True),
)
def parse_start_url(self,response):
for sel in response.xpath('//*[@class="content"]'):
item = NaukriItem()
item['title'] = sel.xpath('span[@class="desig"]/text()').extract()
item['location'] = sel.xpath('span[@class="loc"]/span/text()').extract()
item['organization'] = sel.xpath('span[@class="org"]/text()').extract()
yield item