全部
我创建了一个抓取 Indeed 职位信息的爬虫,用来学习 Python 并了解相关的工作机会。现在爬虫终于可以正常运行了,但它总是在抓到大约 1700 个职位后就停止,而 Indeed 网站上有时明明有超过 100,000 个职位。代码中有什么地方出错了吗?
from urllib.parse import urlencode

from scrapy.http import Request
from scrapy.loader import ItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.spiders import CrawlSpider

from indeed.items import IndeedItem
class IndeedSpider(CrawlSpider):
    """Scrape job postings from indeed.nl search-result pages.

    Spider arguments (``-a job=... -a loc=... -a jobtype=... -a age=...``)
    map onto Indeed's query parameters q / rbl / jt / fromage.

    NOTE(review): Indeed only serves a limited window of results per query
    (on the order of ~1000 matches), which is the likely reason the crawl
    finishes after a few thousand items even when the site reports 100,000+
    jobs — TODO confirm. To gather more, split the search into many narrower
    queries (per location, date range, etc.).
    """

    name = "indeed"
    allowed_domains = ["www.indeed.nl"]

    def start_requests(self):
        """Build the initial search URL from the spider arguments.

        Only parameters that were actually supplied are put in the query
        string. The original code interpolated unset values as the literal
        string "None" (e.g. ``&jt=None``) and collected ``age`` without ever
        using it; both are fixed here.
        """
        params = {
            'q': getattr(self, 'job', 'data analyst'),
            'rbl': getattr(self, 'loc', ''),
            'jt': getattr(self, 'jobtype', None),
            'fromage': getattr(self, 'age', None),
        }
        # Drop unset parameters and URL-encode the rest.
        query = urlencode({k: v for k, v in params.items() if v is not None})
        yield Request("https://www.indeed.nl/jobs?" + query, self.parse)

    def parse(self, response):
        """Yield one item per result row, then follow pagination links."""
        # response.xpath replaces the deprecated HtmlXPathSelector wrapper
        # (the source of the ScrapyDeprecationWarning in the log).
        for site in response.xpath('//div[contains(@class,"row")]'):
            item = IndeedItem()

            # Company: prefer the linked name, fall back to plain text.
            company = (
                site.xpath(".//span[@class='company']//a/text()").extract_first()
                or site.xpath(".//span[@class='company']/text()").extract_first()
            )
            # Guard: the original called company.strip() unconditionally and
            # raised AttributeError on rows without a company node.
            item['company'] = company.strip() if company else None

            # Location: Indeed uses several markup variants.
            item['location'] = (
                site.xpath(".//span[@class='location']//a/text()").extract_first()
                or site.xpath(".//span[@class='location']/text()").extract_first()
                or site.xpath(".//div[@class='location']/text()").extract_first()
            )

            item['title'] = site.xpath(
                './/a[@data-tn-element="jobTitle"]/@title[1]').extract_first()

            # Posting date: normal, div-based, and sponsored markup variants.
            item['date'] = (
                site.xpath(".//span[@class='date']//a/text()").extract_first()
                or site.xpath(".//span[@class='date']/text()").extract_first()
                or site.xpath(".//div[@class='date']/text()").extract_first()
                or site.xpath(".//span[@class=' sponsoredGray ']/text()").extract_first()
            )

            yield item

        # Follow every pagination link ("pn" = page navigation). Duplicate
        # URLs are filtered by Scrapy's dupefilter (see log: dupefilter/filtered).
        for href in response.xpath('//span[@class="pn"]/parent::a/@href').extract():
            yield Request(response.urljoin(href))
这是显示的日志:
2019-02-08 23:22:07 [scrapy.core.engine] INFO: Closing spider (finished)
2019-02-08 23:22:07 [scrapy.extensions.feedexport] INFO: Stored csv feed (1589 items) in: data2.csv
2019-02-08 23:22:07 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 71580,
'downloader/request_count': 102,
'downloader/request_method_count/GET': 102,
'downloader/response_bytes': 6642752,
'downloader/response_count': 102,
'downloader/response_status_count/200': 101,
'downloader/response_status_count/301': 1,
'dupefilter/filtered': 15355,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 2, 8, 22, 22, 7, 746299),
'httpcache/hit': 102,
'item_scraped_count': 1589,
'log_count/DEBUG': 1693,
'log_count/INFO': 10,
'log_count/WARNING': 2,
'memusage/max': 51048448,
'memusage/startup': 51048448,
'request_depth_max': 26,
'response_received_count': 101,
'scheduler/dequeued': 102,
'scheduler/dequeued/memory': 102,
'scheduler/enqueued': 102,
'scheduler/enqueued/memory': 102,
'start_time': datetime.datetime(2019, 2, 8, 22, 21, 58, 208282)}
2019-02-08 23:22:07 [scrapy.core.engine] INFO: Spider closed (finished)
显示的警告:
ScrapyDeprecationWarning: scrapy.selector.HtmlXPathSelector is deprecated, instantiate scrapy.Selector instead. hxs = HtmlXPathSelector(response) 2019-02-11 09:55:20 [py.warnings] WARNING: /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/parsel/selector.py:250: ScrapyDeprecationWarning: scrapy.selector.HtmlXPathSelector is deprecated, instantiate scrapy.Selector instead. for x in result]