Why does scrapy only return part of the results?

Asked: 2021-03-29 14:17:00

Tags: python scrapy

I am fairly new to Python and Scrapy. I am trying to scrape the job portal https://www.jobs.ch/de/, currently starting from https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/. The scraper runs fine, but it does not return all job results: out of 24 results per page it returns a varying number (tested with 2 pages: 21/24 and 23/24). I checked whether the CSS paths of the missing results differ from the others, but they are identical. Does anyone know why I am not getting all the results? Any advice is much appreciated.

import scrapy
from jobscraping.items import JobscrapingItem


class GetdataSpider(scrapy.Spider):
    name = 'getdata5'
    start_urls = ['https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/']

    def parse(self, response):
        # scrape every job ad on the current results page ...
        yield from self.scrape(response)

        # ... then follow the pagination link to the next results page
        next_page = response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.itnOWY > a.sc-fznxsB.fvMaWZ.Link-sc-1vy3ms6-1.fvbIfL:last-child').attrib['href']
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def scrape(self, response):
        # each job ad on the results page sits in its own ShadowBox container
        for add in response.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd'):
            item = JobscrapingItem()
            addpage = response.urljoin(add.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd a::attr(href)').get(default='not-found'))
            item['addlink'] = addpage
            item['Position'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.VacancySerpItem___StyledText-qr45cp-6.gHnsfC::text').get(default='not-found')
            item['Company'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY > strong::text').get(default='not-found')
            item['Location'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY::text').get(default='not-found')

            # pass the partially filled item along to the detail-page callback
            request1 = scrapy.Request(addpage, callback=self.get_addinfos)
            request1.meta['item'] = item
            yield request1

    def get_addinfos(self, response):
        for details in response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.VacancyDetailHead__StyledVacancyDetailHead-sc-14lkltl-0.VacancyDetailHead___StyledStyledVacancyDetailHead-sc-14lkltl-1.deEQGn'):
            item = response.meta['item']
            companypage = response.urljoin(details.css('div.sc-AxiKw.XkVWn > span > div > a::attr(href)').get(default='not-found'))
            item['companylink'] = companypage
            item['Date'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(1) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
            item['Rank'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(2) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
            item['Workload'] = details.css('span.sc-fzqNJr.Badge-sc-88vuku-0.dCIQfi::text').get(default='not-found')

            # hand the item on to the company-page callback
            request2 = scrapy.Request(companypage, callback=self.get_companyinfos)
            request2.meta['item'] = item
            yield request2

    def get_companyinfos(self, response):
        # fill in the company-profile fields and emit the finished item
        item = response.meta['item']
        item['Industry'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.leHDqM::text').get(default='not-found')
        item['Open_jobs'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.fMPCQO > span::text').get(default='not-found')
        item['Employees'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.GqJfV > span::text').get(default='not-found')
        item['Rating_overall'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.gKcdxd::text').get(default='not-found')
        item['Rating_detailed'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.hVUXAg::text').getall()
        item['Rating_numbers'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.kxNaWG::text').get(default='not-found')
        yield item
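
A side note on debugging cases like this: Scrapy's built-in duplicate filter discards repeated requests and by default only logs the first one it filters, so items can go missing with barely a trace in the output. Below is a minimal settings.py sketch that makes every dropped request visible in the crawl log (DUPEFILTER_DEBUG is a standard Scrapy setting; this is only a diagnostic aid, not part of the fix):

# settings.py -- log every request the duplicate filter discards,
# not just the first one, so silently dropped requests show up in the log.
DUPEFILTER_DEBUG = True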

The items.py file:

import scrapy


class JobscrapingItem(scrapy.Item):
    # define the fields for your item here like:
    addlink = scrapy.Field()
    Position = scrapy.Field()
    Company = scrapy.Field()
    Location = scrapy.Field()
    Date = scrapy.Field()
    Rank = scrapy.Field()
    Workload = scrapy.Field()
    companylink = scrapy.Field()
    Industry = scrapy.Field()
    Open_jobs = scrapy.Field()
    Employees = scrapy.Field()
    Rating_overall = scrapy.Field()
    Rating_detailed = scrapy.Field()
    Rating_numbers = scrapy.Field()

1 answer:

Answer 0 (score: 0)

I found the mistake in my code. Because some companies post more than one job ad, the scraper was dropping the duplicate requests. I set dont_filter=True, which solved the problem. I also added an if statement for ads without a company link, so the scraper yields those items directly instead of trying to scrape a company page first.
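
For reference, a minimal sketch of what the corrected get_addinfos callback could look like with both changes applied. The selectors are copied from the question above; the if/else structure and the placement of dont_filter=True on the company-page request (the request that several ads from the same company share, and therefore the one the duplicate filter was dropping) are assumptions, not the poster's verbatim code:

# Sketch of a corrected get_addinfos method inside the spider class.
def get_addinfos(self, response):
    for details in response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.VacancyDetailHead__StyledVacancyDetailHead-sc-14lkltl-0.VacancyDetailHead___StyledStyledVacancyDetailHead-sc-14lkltl-1.deEQGn'):
        item = response.meta['item']
        item['Date'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(1) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
        item['Rank'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(2) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
        item['Workload'] = details.css('span.sc-fzqNJr.Badge-sc-88vuku-0.dCIQfi::text').get(default='not-found')

        company_href = details.css('div.sc-AxiKw.XkVWn > span > div > a::attr(href)').get()
        if company_href is None:
            # No company profile is linked: yield the item directly instead of
            # requesting a page that does not exist.
            item['companylink'] = 'not-found'
            yield item
        else:
            item['companylink'] = response.urljoin(company_href)
            # dont_filter=True stops Scrapy's duplicate filter from dropping
            # requests to a company page that several job ads share, which is
            # what made whole items disappear from the output.
            yield scrapy.Request(
                item['companylink'],
                callback=self.get_companyinfos,
                meta={'item': item},
                dont_filter=True,
            )

An alternative to dont_filter=True would be to request each shared company page only once and merge its data into the items afterwards, but for a small crawl simply re-requesting the page is the simpler fix.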