我对 Python 和 Scrapy 比较陌生。我正在尝试抓取招聘门户网站 https://www.jobs.ch/de/,目前从 https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/ 开始。爬虫目前可以正常运行,但没有返回全部职位结果:每页有 24 条结果,而 Scrapy 返回的数量每页都不一样(用 2 页测试:21/24 和 23/24)。我检查过缺失结果的 CSS 路径是否有所不同,但它们是相同的。有人知道为什么我没有得到所有结果吗?非常感谢所有建议。
import scrapy
from jobscraping.items import JobscrapingItem
class GetdataSpider(scrapy.Spider):
    """Scrape job ads from jobs.ch, following each ad to its detail and company page.

    Every ad on a listing page becomes one ``JobscrapingItem``. The item is
    passed along through ``Request.meta`` while it is filled in three steps:
    listing page (``scrape``), ad detail page (``get_addinfos``) and company
    page (``get_companyinfos``). Whenever a follow-up link is missing, the
    partially filled item is yielded right away instead of being dropped.
    """

    name = 'getdata5'
    start_urls = ['https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/']

    def parse(self, response):
        """Process one listing page and follow the pagination link, if any.

        Yields the requests produced by ``scrape`` and, when a next-page link
        is present, a request for the next listing page.
        """
        yield from self.scrape(response)
        # ``attrib`` is an empty dict when nothing matches, so use ``.get`` to
        # obtain None on the last page instead of raising KeyError.
        next_page = response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.itnOWY > a.sc-fznxsB.fvMaWZ.Link-sc-1vy3ms6-1.fvbIfL:last-child').attrib.get('href')
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def scrape(self, response):
        """Extract the ads of a listing page.

        Yields one request to the ad detail page per ad (carrying the item in
        ``meta['item']``), or the item itself when the ad has no link.
        """
        for add in response.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd'):
            item = JobscrapingItem()
            item['Position'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.VacancySerpItem___StyledText-qr45cp-6.gHnsfC::text').get(default='not-found')
            item['Company'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY > strong::text').get(default='not-found')
            item['Location'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY::text').get(default='not-found')

            href = add.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd a::attr(href)').get()
            if href is None:
                # Without a link there is no detail page to visit; joining a
                # placeholder onto the base URL would only request a bogus page.
                item['addlink'] = 'not-found'
                yield item
                continue

            addpage = response.urljoin(href)
            item['addlink'] = addpage
            yield scrapy.Request(addpage, callback=self.get_addinfos, meta={'item': item})

    def get_addinfos(self, response):
        """Add the detail-page fields to the item carried in ``meta['item']``.

        Yields a request to the company page, or the item itself when the ad
        has no company link.
        """
        item = response.meta['item']
        # Query through the whole selector list rather than looping over it:
        # ``.get`` returns the first match or the default, so the item is
        # neither lost when the header is missing nor emitted more than once.
        details = response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.VacancyDetailHead__StyledVacancyDetailHead-sc-14lkltl-0.VacancyDetailHead___StyledStyledVacancyDetailHead-sc-14lkltl-1.deEQGn')
        item['Date'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(1) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
        item['Rank'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(2) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
        item['Workload'] = details.css('span.sc-fzqNJr.Badge-sc-88vuku-0.dCIQfi::text').get(default='not-found')

        company_href = details.css('div.sc-AxiKw.XkVWn > span > div > a::attr(href)').get()
        if company_href is None:
            # No company page to visit: emit what has been collected so far.
            item['companylink'] = 'not-found'
            yield item
            return

        companypage = response.urljoin(company_href)
        item['companylink'] = companypage
        # Several ads can belong to the same company. With the default
        # duplicate filter only the first request per company URL is sent and
        # the items of the other ads are never yielded, so bypass the filter.
        yield scrapy.Request(
            companypage,
            callback=self.get_companyinfos,
            meta={'item': item},
            dont_filter=True,
        )

    def get_companyinfos(self, response):
        """Add the company-page fields to the item and yield the finished item."""
        item = response.meta['item']
        item['Industry'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.leHDqM::text').get(default='not-found')
        item['Open_jobs'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.fMPCQO > span::text').get(default='not-found')
        item['Employees'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.GqJfV > span::text').get(default='not-found')
        item['Rating_overall'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.gKcdxd::text').get(default='not-found')
        # Unlike the other fields this one is a list of every matching text node.
        item['Rating_detailed'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.hVUXAg::text').getall()
        item['Rating_numbers'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.kxNaWG::text').get(default='not-found')
        yield item
items.py 文件:
import scrapy
class JobscrapingItem(scrapy.Item):
    """One scraped job ad together with its detail-page and company-page data."""

    # Link to the ad and the basic data describing it.
    addlink = scrapy.Field()
    Position = scrapy.Field()
    Company = scrapy.Field()
    Location = scrapy.Field()

    # Further details of the ad.
    Date = scrapy.Field()
    Rank = scrapy.Field()
    Workload = scrapy.Field()

    # Link to the company and data about it.
    companylink = scrapy.Field()
    Industry = scrapy.Field()
    Open_jobs = scrapy.Field()
    Employees = scrapy.Field()

    # Company ratings.
    Rating_overall = scrapy.Field()
    Rating_detailed = scrapy.Field()
    Rating_numbers = scrapy.Field()
答案 0 :(得分:0)
我在自己的代码中找到了错误。由于有些公司发布了不止一个职位,Scrapy 的重复请求过滤器会丢弃对同一公司页面的重复请求,这些职位对应的条目也就丢失了。我在请求上设置了 dont_filter=True,这解决了问题。我还为没有公司链接的职位添加了 if 语句,这样爬虫会直接产出这些条目,而不再继续请求公司页面。