scrapy: "Load more results" pages

Date: 2018-02-11 06:39:59

Tags: python-3.x web-scraping scrapy scrapy-spider

I am trying to write the Scrapy script below to scrape items from the site. I can scrape the items on the first page, but there are around 2,000 pages in total and I want to scrape them all. The site has a "LOAD MORE RESULTS" option; I tried to scrape those additional pages too, but could not get it to work. Please help me.

import math

import scrapy

class MyItems(scrapy.Item):
    date = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()

class ProductSpider(scrapy.Spider):
    name = 'reuters'
    allowed_domains = ['reuters.com']
    start_urls = ['https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.']
    download_delay = 1.5

    def parse(self, response):
        for url in response.css('h3.search-result-title a ::attr(href)').extract():
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_article)

        # "load more results"
        job_count = 1970
        job_per_page = 10

        pages = math.ceil(job_count / job_per_page)

        for page in range(2, pages):

            headers = {
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'en-US,en;q=0.9,bn;q=0.8,af;q=0.7',
                'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
                'accept': '*/*',
                'referer': 'https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.',
                'authority': 'www.reuters.com',
                'cookie': '_ga=GA1.2.592162541.1518081459; _gid=GA1.2.1478931362.1518081459; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22e58b8e9e-8674-49b4-aaff-0248b6976654%22; _cb_ls=1; OX_plg=pm; __gads=ID=3c74f81d13d6c1b4:T=1518081460:S=ALNI_MZsx67ijryijAj2JcD2YXXZw20zIA; _cb=sjG2aCNHffBaLnBl; AAMC_reuters_0=REGION%7C3; aam_uuid=06971314173867630360429126725673522696; _cb_svref=null; D_DUID=334503eb-dac8-49cd-babd-02081b0b6d24; D_TOKEN=1.0:a25bacf1dbb943e3ba1e93edb2093843:9841e8a348072081c4b770cfdd017d59831a31e6d41f368c89065cd08eec79bb34c9020669a0d8cbd7a670e4e11de2e762b5f67038115c02ba5fcbd9da8de4078116daf500471d1d6440734c181cb49859090467365cbf9d646c0d3fc7e7bb7e4e2643ea7a20bf00f9a695f9bf30b0df402746b31e429526a87ed7aa3c9da9bb:4b5290392fda7a6ff1f0f529cfad0d027a406ae35b6edb8e7cd3f6493ca8b99d; OX_sd=2; mnet_session_depth=2%7C1518104359854; _chartbeat2=.1518081466539.1518104385876.1.k_ivd8UuDjDegChcDsjhRBbcy9U',
            }

            data = {'blob': 'National Health Investors, Inc.',
                    'bigOrSmall': 'big',
                    'articleWithBlog': 'true',
                    'sortBy': '',
                    'dateRange': '',
                    'numResultsToShow': '10',
                    'pn': str(page),
                    'callback': 'addMoreNewsResults'}

            url = 'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=10&pn={}&callback=addMoreNewsResults'.format(page)

            yield scrapy.FormRequest(url,
                                     headers=headers, callback=self.parse
                                     )

    def parse_article(self, response):
        print('\n')
        print('***Heading:***', response.css('h1.ArticleHeader_headline_2zdFM ::text').extract_first())
        print('***Url-Link:***', response.url)
        print('***Date :***', response.css('div.ArticleHeader_date_V9eGk ::text').extract())
        print('\n')

1 Answer:

Answer 0 (score: 1)

Every click on "LOAD MORE RESULTS" returns a JavaScript response with a JSON object inside:

if (typeof addMoreNewsResults == 'function') { 
addMoreNewsResults( {
    blob: 'National+Health+Investors%2C+Inc.',
    sortBy: 'relevance',
    dateRange: 'all',
    totalResultNumber: 1970,
    totalResultNumberStr: "1,970",
    news: [ 
        {
        id: "-pbm-push-idUSKBN1DG2CP",
        headline: "Diplomat Pharmacy plunges as <b>investors<\/b> fret over rapid PBM push",
        date: "November 16, 2017 11:22am EST",
        href: "/article/us-diplomat-stocks/diplomat-pharmacy-plunges-as-investors-fret-over-rapid-pbm-push-idUSKBN1DG2CP",
        blurb: "...(Reuters) - Shares of Diplomat Pharmacy <b>Inc<\/b> &lt;DPLO.N&gt; tumbled 20... <b>National<\/b> Pharmaceutical Services.\nSome analysts were not excited...",
        mainPicUrl: ""
        }, 
        {....

So you need a different parsing mechanism to get the information you want (import json, json.loads(), and so on). Note that the payload is a JavaScript object literal rather than strict JSON (the keys are unquoted and the string values use single quotes), so json.loads() only works after the text has been normalized, as done in the update below.

There are much simpler ways, though. You can fetch everything in one request; just change the numResultsToShow parameter so it covers all of the results:
https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=2000&pn=1&callback=addMoreNewsResults
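
As a minimal sketch, the same one-shot URL can also be built from the query parameters instead of being hard-coded as a string. The parameter names and values below are copied from the question's request; only the raised numResultsToShow is new:

from urllib.parse import urlencode

# Parameters copied from the question's "load more" request;
# numResultsToShow is raised to 2000 to cover the reported 1,970 results.
params = {
    'blob': 'National Health Investors, Inc.',
    'bigOrSmall': 'big',
    'articleWithBlog': 'true',
    'sortBy': '',
    'dateRange': '',
    'numResultsToShow': '2000',
    'pn': '1',
    'callback': 'addMoreNewsResults',
}
url = ('https://www.reuters.com/assets/searchArticleLoadMoreJson?'
       + urlencode(params))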

Update:

# -*- coding: utf-8 -*-

import scrapy
import re
import json

class ReutersSpider(scrapy.Spider):
    name = "reuters"
    start_urls = [
        'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=2000&pn=1&callback=addMoreNewsResults',
    ]

    def parse(self, response):

        # response.body is bytes under Python 3; use response.text so the
        # str pattern below matches.
        json_string = re.search(r'addMoreNewsResults\((.+?) \);', response.text, re.DOTALL).group(1)

        # Transform the Javascript-ish JSON-like structure into valid JSON:
        # quote the bare keys, quote bare values at the end of a line, and
        # convert single-quoted strings to double-quoted ones.
        json_string = re.sub(r'^\s*(\w+):', r'"\1":', json_string, flags=re.MULTILINE)
        json_string = re.sub(r'(\w+),\s*$', r'"\1",', json_string, flags=re.MULTILINE)
        json_string = re.sub(r':\s*\'(.+?)\',\s*$', r': "\1",', json_string, flags=re.MULTILINE)

        results = json.loads(json_string)

        for result in results["news"]:
            item = {}
            item["href"] = result["href"]
            item["date"] = result["date"]

            yield item
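
If you also want the article pages themselves rather than just the search metadata, you could follow each href instead of yielding it directly. A sketch of that variation, reusing the selectors from the question (the hashed class names such as ArticleHeader_headline_2zdFM come from the question and are likely to change):

        # Variation on the loop above: follow each result to its article page.
        for result in results["news"]:
            yield scrapy.Request(
                response.urljoin(result["href"]),
                callback=self.parse_article,
                meta={'date': result["date"]},
            )

    def parse_article(self, response):
        # Selector copied from the question; adjust it if the markup changed.
        yield {
            'url': response.url,
            'date': response.meta['date'],
            'title': response.css('h1.ArticleHeader_headline_2zdFM ::text').extract_first(),
        }

Either version can be run as usual, for example with scrapy crawl reuters -o items.json, to dump the scraped items to a file.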