I'm trying to build a Scrapy bot that uses pagination, but without success...
The bot crawls all the links on the first page but never moves on to the next page. I've read plenty of different threads and I simply can't solve this. I'm new to web scraping, so feel free to cut the cruft out of my code.
import time

from scrapy.spiders import CrawlSpider, Rule
#from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http.request import Request
from tutorial.items import TutorialItem
#from scrapy_tutorial.items import ScrapyTutorialItem

class raytheonJobsPageSpider(CrawlSpider):

    name = "raytheonJobsStart"
    allowed_domains = ["jobs.raytheon.com"]
    start_urls = [
        "https://jobs.raytheon.com/search-jobs"
    ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//div[@class="next"]',)),
             callback='parse_listings', follow=True),
    )

    def parse_start_url(self, response):
        '''
        Crawl start URLs
        '''
        return self.parse_listings(response)

    def parse_listings(self, response):
        '''
        Extract data from listing pages
        '''
        sel = Selector(response)
        jobs = response.xpath(
            '//*[@id="search-results-list"]/ul/*/a/@href'
        ).extract()
        nextLink = response.xpath('//a[@class="next"]').extract()
        print "This is just the next page link - ", nextLink
        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)
            yield Request(job_url, callback=self.parse_details)

    def parse_details(self, response):
        '''
        Extract data from details pages
        '''
        sel = Selector(response)
        job = sel.xpath('//*[@id="content"]')
        item = TutorialItem()

        # Populate job fields
        item['title'] = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
        jobTitle = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
        item['reqid'] = job.xpath('//*[@id="content"]/section[1]/div/span[1]/text()').extract()
        item['location'] = job.xpath('//*[@id="content"]/section[1]/div/span[last()]/text()').extract()
        item['applink'] = job.xpath('//*[@id="content"]/section[1]/div/a[2]/@href').extract()
        item['description'] = job.xpath('//*[@id="content"]/section[1]/div/div').extract()
        item['clearance'] = job.xpath('//*[@id="content"]/section[1]/div/*/text()').extract()
        #item['page_url'] = response.url
        item = self.__normalise_item(item, response.url)
        time.sleep(1)
        return item

    def __normalise_item(self, item, base_url):
        '''
        Standardise and format item fields
        '''
        # Loop item fields to sanitise data and standardise data types
        for key, value in vars(item).values()[0].iteritems():
            item[key] = self.__normalise(item[key])

        # Convert job URL from relative to absolute URL
        #item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])
        return item

    def __normalise(self, value):
        print self, value
        # Convert list to string
        value = value if type(value) is not list else ' '.join(value)
        # Trim leading and trailing special characters (whitespace, newlines, tabs, carriage returns)
        value = value.strip()
        return value

    def __to_absolute_url(self, base_url, link):
        '''
        Convert relative URL to absolute URL
        '''
        import urlparse
        link = urlparse.urljoin(base_url, link)
        return link

    def __to_int(self, value):
        '''
        Convert value to integer type
        '''
        try:
            value = int(value)
        except ValueError:
            value = 0
        return value

    def __to_float(self, value):
        '''
        Convert value to float type
        '''
        try:
            value = float(value)
        except ValueError:
            value = 0.0
        return value
Answer 0 (score: 1)
You don't need PhantomJS or Splash.
By inspecting the AJAX calls, I found that the site loads the jobs through an AJAX request to this URL (the ajaxURL in the spider below). You can see the CurrentPage parameter at the end of the URL. The results come back as JSON, with all the jobs under a key named results.
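As a quick sanity check, you can fetch one page of that endpoint outside Scrapy and confirm the shape of the response. This is a minimal Python 2 sketch (matching the rest of this thread); it assumes nothing beyond what's described above, namely that the JSON envelope has a results key holding the rendered HTML for the page:

    import json
    import urllib2  # Python 2; use urllib.request on Python 3

    # The AJAX endpoint from the spider below; only CurrentPage varies.
    ajaxURL = ("https://jobs.raytheon.com/search-jobs/results?"
               "ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0"
               "&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False"
               "&CustomFacetName=&FacetTerm=&FacetType=0"
               "&SearchResultsModuleName=Search+Results"
               "&SearchFiltersModuleName=Search+Filters"
               "&SortCriteria=5&SortDirection=1&SearchType=5"
               "&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm="
               "&LocationFacetType=&KeywordType=&LocationType=&LocationPath="
               "&OrganizationIds=&CurrentPage=")

    # Fetch page 1 and parse the JSON envelope.
    data = json.loads(urllib2.urlopen(ajaxURL + "1").read())
    print data.keys()            # expect a 'results' key in the envelope
    print data['results'][:200]  # rendered HTML for that page of jobs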
I set up a project on my end and wrote fully, 100% working code for you. Here is the link on GitHub; just download and run it... you don't need to do anything at all :P
Download the whole working project here: https://github.com/mani619cash/raytheon_pagination
The basic logic is here:

import json

from scrapy import Request
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider

class RaytheonspiderSpider(CrawlSpider):

    name = "raytheonJobsStart"
    page = 180
    ajaxURL = "https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=5&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&CurrentPage="

    def start_requests(self):
        yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)

    def parse_listings(self, response):
        # The endpoint returns JSON; the job markup lives under the 'results' key.
        resp = json.loads(response.body)
        response = Selector(text=resp['results'])
        jobs = response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract()
        if jobs:
            for job_url in jobs:
                # __normalise and parse_details are in the full project linked above
                job_url = "https://jobs.raytheon.com" + self.__normalise(job_url)
                #job_url = self.__to_absolute_url(response.url, job_url)
                yield Request(url=job_url, callback=self.parse_details)
        else:
            raise CloseSpider("No more pages... exiting...")

        # go to next page...
        self.page = self.page + 1
        yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)
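Assuming the snippet sits in a standard Scrapy project (the full repo above also contains the parse_details and __normalise helpers it relies on), you run it the usual way and can dump the items to a feed:

    scrapy crawl raytheonJobsStart -o jobs.json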
Answer 1 (score: 0)
Change

restrict_xpaths=('//div[@class="next"]',))

to

restrict_xpaths=('//a[@class="next"]',))
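In context, that is the rule from the question with only the XPath swapped:

    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//a[@class="next"]',)),
             callback='parse_listings', follow=True),
    )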
If that doesn't work, make a recursive call to the parse_listings function:

    def parse_listings(self, response):
        '''
        Extract data from listing pages
        '''
        sel = Selector(response)
        jobs = response.xpath(
            '//*[@id="search-results-list"]/ul/*/a/@href'
        ).extract()
        # Grab the href itself (not the whole <a> element) so it can be followed
        nextLink = response.xpath('//a[@class="next"]/@href').extract_first()
        print "This is just the next page link - ", nextLink
        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)
            yield Request(job_url, callback=self.parse_details)
        # Recurse into the next listing page, if there is one
        if nextLink:
            nextLink = self.__to_absolute_url(response.url, nextLink)
            yield Request(nextLink, callback=self.parse_listings)
I'm on mobile, so I can't type out the code properly. I hope the logic I've described makes sense.