Scrapy callback function doesn't work "sometimes"

Date: 2018-04-21 16:29:48

Tags: python-2.7 scrapy

I am trying to scrape a job-search website. The process goes like this:

  1. Request the first job-list page (in start_requests).
  2. Parse the job-list page via a callback to the parse_list function.
  3. For each job URL on the job-list page, log "requesting {url}", then request it with a callback to parse_detail. The log looks like this:

    2018-04-21 13:49:54,211: - [JobPageRequest] https://www.jobant.com/job-3998

  4. The parse_detail function logs that parse_detail has been called successfully, then starts parsing the details. The log looks like this:

    2018-04-21 13:52:57,494: jobant - [JobPageParsing] https://www.jobant.com/job-3998

  5. Find the next-page link on the current job-list page; if it exists, go back to step 2, otherwise the job is done.

The problem is that the callback only fires sometimes. The job site contains 64 job postings, but I only got 49 of them, so I checked my logs: [JobPageRequest] was logged exactly 64 times, matching the number of jobs on the site, but [JobPageParsing] was logged only 49 times.

I have tried this several times and the result is exactly the same, 49 pages out of 64. The URLs whose callbacks are never invoked are also exactly the same on every run, but as far as I can see there is no particular pattern or difference between them and the pages that are parsed successfully.

So it looks to me as if, for some reason, parse_detail is simply not called for those specific pages.
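
One way I could narrow this down further (just a diagnostic sketch, not something in my current code) is to dump Scrapy's built-in stats when the spider closes: comparing how many requests were actually sent, how many responses reached a callback, and how many requests the dupe filter dropped would show whether the missing pages were filtered out, never downloaded, or downloaded but never parsed. The closed() hook and crawler.stats are standard Scrapy, though the exact stat key names may differ slightly between versions.

    def closed(self, reason):
        # Called by Scrapy when the spider finishes; dump the collected stats.
        # Keys of interest (names may vary by Scrapy version):
        #   downloader/request_count  - requests actually sent to the site
        #   response_received_count   - responses handed back to callbacks
        #   dupefilter/filtered       - requests dropped as duplicates
        stats = self.crawler.stats.get_stats()
        for key in sorted(stats):
            self.logger.info('[ Stats ] {k} = {v}'.format(k=key, v=stats[key]))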

Here are the relevant parts of the code.

start_requests

        def start_requests(self):
            '''start first request on a job-list page'''
            url = "https://www.jobant.com/jobs-search.php?s_jobtype={job_type}&s_province={province}&page={page}"
            job_type  = self.job_type if hasattr(self,'job_type') else ''
            province = self.province if hasattr(self,'province') else ''
            formatted_url = url.format(page=self.page, job_type=job_type, province=province)
        
            self.logger.info('[ JobListRequest ] {url}'.format(url=formatted_url.encode('utf-8')))
        
            # callback to parse_list
            yield scrapy.Request(url=formatted_url.encode('utf-8'), callback=self.parse_list)
        

parse_list

        def parse_list(self, response):
        
            if self.killed:
                raise CloseSpider("Spider already died.")
        
            ### getting job urls from job list page.
            jobs = response.xpath('//div[@class="item"]/div/div/div/a/@href').extract()
        
            ### for each job page, request for html
            for job_id in jobs:
                url = urljoin("https://www.jobant.com/",job_id) 
                # the use_proxy is hard-coded as False atm
                if self.use_proxy:
                    proxy = choice(self.proxies)
                    self.logger.info('[ JobPageRequest ] {url} with proxy {proxy}'.format(url=url.encode('utf-8'), proxy=proxy))
                    yield scrapy.Request(url, callback=self.parse_detail , meta={'proxy': proxy})
                else:
                    self.logger.info('[ JobPageRequest ] {url}'.format(url=url.encode('utf-8')))
                    # callback to parse_detail
                    yield scrapy.Request(url, callback=self.parse_detail)
        
            # the rest is about finding next job-list page
        

The parse_detail part is not that important; the only relevant bit is that logging the URL is the very first thing I do inside the function:

        def parse_detail(self, response):
        
            self.logger.info('[ JobPageParsing ] {url}'.format(url=response.url.encode('utf-8')))
        
            ## .. The rest is not relevant
        

Here is my full code, in case the mistake is somewhere else.

        import scrapy
        from datetime import datetime
        from scrapy.utils.markup import remove_tags
        from scrapy.http import FormRequest
        from urlparse import urljoin
        from scrapy.exceptions import CloseSpider
        from random import choice
        from hasher import hash_dn
        from sqlalchemy import exc
        
        class TDRISpider(scrapy.Spider):
            custom_settings = {
                'HTTPPROXY_ENABLED': True 
            }
            name        = "jobant"
            page        = 1
            web_id      = 1
        
            ## some variables set up by a factory script on run.
            logger      = None
            sqllogger   = None
            html_path   = None
            max_page    = 9999
            use_proxy   = False
            proxies     = []
        
            ## variables to track repeat / error
            repeat_count     = 0
            repeat_threshold = 3
        
            error_count      = 0
            error_threshold  = 5
        
            killed      = 0
        
            def start_requests(self):
                '''start first request on a job-list page'''
                url = "https://www.jobant.com/jobs-search.php?s_jobtype={job_type}&s_province={province}&page={page}"
                job_type  = self.job_type if hasattr(self,'job_type') else ''
                province = self.province if hasattr(self,'province') else ''
                formatted_url = url.format(page=self.page, job_type=job_type, province=province)
        
                self.logger.info('[ JobListRequest ] {url}'.format(url=formatted_url.encode('utf-8')))
        
                yield scrapy.Request(url=formatted_url.encode('utf-8'), callback=self.parse_list)
        
            def clean_tag(self,s):
                return ' '.join([x.strip() for x in remove_tags(s).split()])
        
            def parse_list(self, response):
        
                if self.killed:
                    raise CloseSpider("Spider already died.")
        
                ### getting job urls from job list page.
                jobs = response.xpath('//div[@class="item"]/div/div/div/a/@href').extract()
        
                ### for each job page, request for html
                for job_id in jobs:
                    url = urljoin("https://www.jobant.com/",job_id) 
                    if self.use_proxy:
                        proxy = choice(self.proxies)
                        self.logger.info('[ JobPageRequest ] {url} with proxy {proxy}'.format(url=url.encode('utf-8'), proxy=proxy))
                        yield scrapy.Request(url, callback=self.parse_detail , meta={'proxy': proxy})
                    else:
                        self.logger.info('[ JobPageRequest ] {url}'.format(url=url.encode('utf-8')))
                        yield scrapy.Request(url, callback=self.parse_detail)
        
                ### getting next job list page url
                next_url = response.xpath('//ul[@class="pagination"]//a/@href').extract()
                if len(next_url) == 2:
                    next_url = next_url[-1]
                elif len(next_url) == 1 and self.page <2:
                    next_url = next_url[0]
                else:
                    next_url = None
        
                ### request next job list, if it exists
                if next_url and self.page <= self.max_page:
                    next_page = urljoin("https://www.jobant.com/",next_url)
                    self.page += 1
                    self.logger.info('[ JobListRequest ] {url}'.format(url=next_page.encode('utf-8')))
                    yield scrapy.Request(url=next_page.encode('utf-8'), callback=self.parse_list)
                elif next_url:
                    self.logger.info('[ JobEndReached ] Max page reached at # %d' % self.max_page)
                    raise CloseSpider("Max page reached")
                else:
                    self.logger.info('[ JobEndReached ] Last page reached at # %d' % self.page)
                    raise CloseSpider("Last page reached")
        
            def parse_detail(self, response):
        
                self.logger.info('[ JobPageParsing ] {url}'.format(url=response.url.encode('utf-8')))
        
                if self.killed:
                    raise CloseSpider("Spider already died.")
        
                ### handle the case when response from web server is empty
                # retry requesting, after 5 failures in a row, log error then continue.
                if not response.body:
                    self.error_count += 1
        
                    if self.error_count >= self.error_threshold:
                        self.logger.error('[ JobPageRequestException ] {url}'.format(url=response.url.encode('utf-8')))
                        self.sqllogger.log_error_page(
                            hash_code    = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')),
                            web_id       = self.web_id,
                            url          = response.url.encode('utf-8'),
                            meta         = response.meta,
                            html_path    = html_path,
                            crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                            job_status   = 'FAILED',
                            error_message= "Empty request's response"
                        )
                        yield None
                        return
                    if self.use_proxy:
                        proxy = choice(self.proxies)
                        self.logger.info('[ JobPageRetry ] {url} with proxy {proxy}'.format(url=response.url.encode('utf-8'), proxy=proxy))
                        yield scrapy.Request(response.url.encode('utf-8'), callback=self.parse_detail , meta={'proxy': proxy})
                        return
                    else:
                        self.logger.info('[ JobPageRetry ] {url}'.format(url=response.url.encode('utf-8')))
                        yield scrapy.Request(response.url.encode('utf-8'), callback=self.parse_detail)
                        return
                self.error_count     = 0
                ###
        
                ### writing html archive
                try:
                    html_path = self.html_path.format(dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
                    with open(html_path, 'w') as f:
                        f.write(response.text.encode('utf-8'))
                        self.logger.info('[ HTMLArchived ] {url}'.format(url=response.url.encode('utf-8')))
                except Exception as e:
                    self.logger.error('[ HTMLArchiveException ] {url}'.format(url=response.url.encode('utf-8')))
                ###
        
                try:
                    ### parsing information
                    contents         = response.xpath('.//div[@class="wrapper-preview-list"]/div[contains(@class,"row tr")]/div[contains(@class,"col-sm")]')
                    content_str      = [self.clean_tag(content.xpath('./div/div')[1].extract()) for content in contents[:10]]
        
                    pos, company     = [x.strip() for x in response.xpath('//h1[@class="title-section c4 xs-mt5"]/text()').extract_first().split(',',1)]
        
                    ret = {}
        
                    ret['company']   = company
                    ret['pos']       = pos
                    ret['etype']     = content_str[1]
                    ret['indus']     = content_str[2]
                    ret['amnt']      = content_str[3]
                    ret['sal']       = content_str[4]
                    ret['exp']       = content_str[5]
                    ret['sex']       = content_str[6]
                    ret['edu']       = content_str[7]
                    ret['loc']       = content_str[8]
                    ret['desc']      = '|'.join([x.strip() for x in contents[11].xpath('./text()').extract()])
                    ret['pdate']     = response.xpath('//span[@itemprop="datePosted"]/text()').extract_first()
        
                    if ret['pdate'].split('/')[-1] == "2017":
                        self.logger.info("[ JobEndReached ] 2017 reached")
                        self.killed  = 1
                        raise CloseSpider("2017 reached")
        
                    for key in ret.keys():
                        if ret[key]:
                            ret[key] = ret[key].strip().replace('%','%%').encode('utf-8')
                    ###
        
                    # create hash for tracking jobs
                    _hash = hash_dn(ret['desc'],ret['company']) 
        
                    ### log result to MySQL
                    try:
                        self.sqllogger.log_crawled_page(
                            hash_code    = _hash,
                            position     = ret['pos'],
                            employer     = ret['company'],
                            exp          = ret['exp'],
                            salary       = ret['sal'],
                            location     = ret['loc'],
                            web_id       = self.web_id,
                            url          = response.url.encode('utf-8'),
                            meta         = response.meta,
                            html_path    = html_path,
                            crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                            post_time    = ret['pdate'],
                            job_status   = 'SUCCESS',
                            error_message= ''
                        )
                        self.logger.info('[ RDSLogged ] {url}'.format(url=response.url.encode('utf-8')))
                    except exc.IntegrityError as e:
                        ### check encountering old record by catching error that mysql will throw
                        # if old record is met. (primary key(hash) is repeating)
                        # The error code for such error is 1062
                        ### Stop spider after encountering crawled record 3 times IN A ROW.
                        # to prevent spider stopping just from getting a few old records
                        # that may happen because of new job updates
                        if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                            self.logger.info("[ JobEndReached ] crawled record reached exceeding threshold")
                            self.killed = 1
                            raise CloseSpider("Crawled record reached")
                        elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                            self.repeat_count += 1
                            self.logger.info("[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count)
                            yield None
                            return
                        else:
                            raise e
                        ###
                    self.repeat_count = 0
                    ###
        
                    yield ret
        
                except CloseSpider as e:
                    raise CloseSpider(e.message)
        
                except Exception as e:
                    self.logger.error('[ JobDetailException ] {url} {html_path} {e}'.format(url=response.url.encode('utf-8'),html_path=html_path.encode('utf-8'),e=e))
                    self.sqllogger.log_error_page(
                        hash_code    = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')),
                        web_id       = self.web_id,
                        url          = response.url.encode('utf-8'),
                        meta         = response.meta,
                        html_path    = html_path,
                        crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        job_status   = 'FAILED',
                        error_message= e
                    )
        

1 Answer:

Answer 0 (score: 0):

So, while I was typing up this question, I found my mistake. It is a silly one, but it may be useful to someone else.

In the parse_list function, I have this piece of code that detects whether the current job-list page is the last one:

    if next_url and self.page <= self.max_page:
        next_page = urljoin("https://www.jobant.com/",next_url)
        self.page += 1
        self.logger.info('[ JobListRequest ] {url}'.format(url=next_page.encode('utf-8')))
        yield scrapy.Request(url=next_page.encode('utf-8'), callback=self.parse_list)
    elif next_url:
        self.logger.info('[ JobEndReached ] Max page reached at # %d' % self.max_page)
        raise CloseSpider("Max page reached")
    else:
        self.logger.info('[ JobEndReached ] Last page reached at # %d' % self.page)
        raise CloseSpider("Last page reached")

And here is my mistake:

When I manually raise the CloseSpider exception to stop the crawl, it also cancels requests that have already been scheduled but have not started downloading yet.

This was not obvious, because in earlier experiments I had found that raising CloseSpider does not kill the spider immediately, so I wrongly assumed that any request already yielded before the CloseSpider would still be completed eventually.
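
A minimal fix, sketched under the assumption that nothing else depends on CloseSpider being raised on the last page: instead of raising the exception in parse_list, simply stop yielding the next list-page request and let the queue drain. The detail requests that are already scheduled then still reach parse_detail, and Scrapy closes the spider on its own once the scheduler is empty.

    ### request next job list, if it exists
    if next_url and self.page <= self.max_page:
        next_page = urljoin("https://www.jobant.com/", next_url)
        self.page += 1
        self.logger.info('[ JobListRequest ] {url}'.format(url=next_page.encode('utf-8')))
        yield scrapy.Request(url=next_page.encode('utf-8'), callback=self.parse_list)
    else:
        # Do not raise CloseSpider here: just stop yielding new list-page requests.
        # The detail requests already in the scheduler are still crawled,
        # and the spider closes itself once the queue runs dry.
        if next_url:
            self.logger.info('[ JobEndReached ] Max page reached at # %d' % self.max_page)
        else:
            self.logger.info('[ JobEndReached ] Last page reached at # %d' % self.page)

The 2017 check and the repeated-record check in parse_detail would need the same treatment if they are not meant to cancel pages that are already in flight.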