Scrapy pagination issues - new to this stuff

Date: 2016-12-01 02:06:32

Tags: python web-scraping scrapy

I am trying to make a Scrapy bot that uses pagination, but I am having no success...

The bot crawls all of the links on the first page but never moves on to the next page. I have read through a ton of different threads and I simply cannot solve this. I am new to web scraping, so please feel free to cut any junk out of my code.

    import time
    from scrapy.spiders import CrawlSpider, Rule
    #from scrapy.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.contrib.linkextractors import LinkExtractor
    from scrapy.selector import Selector
    from scrapy.http.request import Request
    from tutorial.items import TutorialItem


    #from scrapy_tutorial.items import ScrapyTutorialItem

    class raytheonJobsPageSpider(CrawlSpider):

        name = "raytheonJobsStart"
        allowed_domains = ["jobs.raytheon.com"]
        start_urls = [
            "https://jobs.raytheon.com/search-jobs"
        ]

        rules = ( Rule(LinkExtractor(restrict_xpaths=('//div[@class="next"]',)), callback='parse_listings',follow=True), )

        def parse_start_url(self, response):
            '''
            Crawl start URLs
            '''

            return self.parse_listings(response)

        def parse_listings(self, response):
            '''
            Extract data from listing pages
            '''

            sel = Selector(response)
            jobs = response.xpath(
                '//*[@id="search-results-list"]/ul/*/a/@href'
            ).extract()
            nextLink = response.xpath('//a[@class="next"]').extract()
            print "This is just the next page link - ",nextLink

            for job_url in jobs:
                job_url = self.__normalise(job_url)
                job_url = self.__to_absolute_url(response.url, job_url)

                yield Request(job_url, callback=self.parse_details)

        def parse_details(self, response):
            '''
            Extract data from details pages
            '''


            sel = Selector(response)
            job = sel.xpath('//*[@id="content"]')
            item = TutorialItem()
            # Populate job fields
            item['title'] = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
            jobTitle=job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
            item['reqid'] = job.xpath('//*[@id="content"]/section[1]/div/span[1]/text()').extract()
            item['location'] = job.xpath('//*[@id="content"]/section[1]/div/span[last()]/text()').extract()
            item['applink'] = job.xpath('//*[@id="content"]/section[1]/div/a[2]/@href').extract()
            item['description'] = job.xpath('//*[@id="content"]/section[1]/div/div').extract()
            item['clearance'] = job.xpath('//*[@id="content"]/section[1]/div/*/text()').extract()
            #item['page_url'] = response.url
            item = self.__normalise_item(item, response.url)
            time.sleep(1)
            return item

        def __normalise_item(self, item, base_url):
            '''
            Standardise and format item fields
            '''

            # Loop item fields to sanitise data and standardise data types
            for key, value in vars(item).values()[0].iteritems():
                item[key] = self.__normalise(item[key])

            # Convert job URL from relative to absolute URL
            #item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])

            return item

        def __normalise(self, value):
            print self,value
            # Convert list to string
            value = value if type(value) is not list else ' '.join(value)
            # Trim leading and trailing special characters (Whitespaces, newlines, spaces, tabs, carriage returns)
            value = value.strip()

            return value

        def __to_absolute_url(self, base_url, link):
            '''
            Convert relative URL to absolute URL
            '''

            import urlparse

            link = urlparse.urljoin(base_url, link)

            return link

        def __to_int(self, value):
            '''
            Convert value to integer type
            '''

            try:
                value = int(value)
            except ValueError:
                value = 0

            return value

        def __to_float(self, value):
            '''
            Convert value to float type
            '''

            try:
                value = float(value)
            except ValueError:
                value = 0.0

            return value

2 Answers:

Answer 0 (score: 1)

You don't need PhantomJS or Splash.

By inspecting the AJAX calls, I found that the jobs are being loaded via an AJAX call to this URL (the ajaxURL in the code below).

You can see the CurrentPage parameter at the end of that URL.

The results are returned in JSON format, with all of the jobs under a key named results.
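
For reference, here is a minimal sketch (plain Python with the third-party requests library, outside Scrapy) that you could use to confirm that response shape yourself. It assumes the endpoint still answers a plain GET the way it did when this was written:

    import json
    import requests  # third-party library, assumed installed (pip install requests)

    # Page 1 of the same AJAX URL the spider below uses
    url = ("https://jobs.raytheon.com/search-jobs/results"
           "?ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0"
           "&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False"
           "&CustomFacetName=&FacetTerm=&FacetType=0"
           "&SearchResultsModuleName=Search+Results"
           "&SearchFiltersModuleName=Search+Filters"
           "&SortCriteria=5&SortDirection=1&SearchType=5"
           "&CategoryFacetTerm=&CategoryFacetType="
           "&LocationFacetTerm=&LocationFacetType="
           "&KeywordType=&LocationType=&LocationPath="
           "&OrganizationIds=&CurrentPage=1")

    data = json.loads(requests.get(url).text)
    print(data.keys())            # expect a 'results' key
    print(data['results'][:200])  # start of the HTML fragment with the job links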

I set up a project on my end and wrote fully working code for you. Here is the link on GitHub; just download and run it... you don't need to do anything at all :P

Download the whole working project here: https://github.com/mani619cash/raytheon_pagination

The basic logic is here (imports added so the snippet runs on its own; __normalise and parse_details are the helpers from the question's spider, included in the repo):

    import json

    from scrapy.spiders import CrawlSpider
    from scrapy.selector import Selector
    from scrapy.http.request import Request
    from scrapy.exceptions import CloseSpider


    class RaytheonspiderSpider(CrawlSpider):

        name = "raytheonJobsStart"
        page = 180
        ajaxURL = "https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=5&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&CurrentPage="

        def start_requests(self):
            # Hit the AJAX endpoint directly instead of the HTML search page
            yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)

        def parse_listings(self, response):
            # The endpoint returns JSON; the job markup is an HTML fragment
            # stored under the 'results' key
            resp = json.loads(response.body)
            response = Selector(text=resp['results'])

            jobs = response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract()
            if jobs:
                for job_url in jobs:
                    job_url = "https://jobs.raytheon.com" + self.__normalise(job_url)
                    yield Request(url=job_url, callback=self.parse_details)
            else:
                # An empty page means we have run past the last page of results
                raise CloseSpider("No more pages... exiting...")

            # go to next page...
            self.page = self.page + 1
            yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)
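
If you download the linked project, you should be able to run the spider from the project root with `scrapy crawl raytheonJobsStart -o jobs.json`, which writes the scraped items out to a JSON file.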

Answer 1 (score: 0)

Change

    restrict_xpaths=('//div[@class="next"]',))

to

    restrict_xpaths=('//a[@class="next"]',))
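
In context, the corrected rule from the question would then read (the only change is div to a):

    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//a[@class="next"]',)),
             callback='parse_listings', follow=True),
    )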

If that doesn't work, make a recursive call to the parse_listings function instead:

    def parse_listings(self, response):
        '''
        Extract data from listing pages
        '''

        sel = Selector(response)
        jobs = response.xpath(
            '//*[@id="search-results-list"]/ul/*/a/@href'
        ).extract()
        # Take the href of the "next" link, not the whole <a> element
        nextLink = response.xpath('//a[@class="next"]/@href').extract_first()
        print "This is just the next page link - ", nextLink

        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)

            yield Request(job_url, callback=self.parse_details)

        # Recurse into the next page, if there is one
        if nextLink:
            nextLink = self.__to_absolute_url(response.url, nextLink)
            yield Request(nextLink, callback=self.parse_listings)

I am on mobile, so I can't type out the code properly. I hope the logic I've described makes sense.