我试图用下面的 Scrapy 脚本抓取下述网站上的条目。我能抓取第一页的条目，但结果约有 2000 页，我想把它们全部抓取下来。页面上有一个"加载更多结果"（LOAD MORE RESULTS）选项，我也尝试抓取加载更多结果后出现的页面，但没有成功。请帮帮我。
import json
import math
import re

import scrapy
from scrapy import Selector
from scrapy.shell import open_in_browser
class MyItems(scrapy.Item):
    """Container for one scraped search result.

    NOTE(review): the spider below never instantiates this item (it yields
    Requests and prints fields); kept for callers that may populate it.
    """
    # publication date string of the article
    date = scrapy.Field()
    # article headline
    title = scrapy.Field()
    # absolute URL of the article page
    link = scrapy.Field()
class ProductSpider(scrapy.Spider):
    """Scrape Reuters search results for "National Health Investors, Inc.".

    The first results page is plain HTML; every later page ("LOAD MORE
    RESULTS") is served by a JSONP endpoint, so those responses need their
    own callback — the HTML CSS selectors can never match a JSONP payload.
    """
    name = 'reuters'
    allowed_domains = ['reuters.com']
    start_urls = ['https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.']
    download_delay = 1.5

    # JSONP pagination endpoint; {} is the 1-based page number.
    MORE_RESULTS_URL = (
        'https://www.reuters.com/assets/searchArticleLoadMoreJson'
        '?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big'
        '&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=10'
        '&pn={}&callback=addMoreNewsResults'
    )

    def parse(self, response):
        """Parse the first (HTML) results page and schedule the rest.

        Fixes over the original: the JSONP pages were sent back to this
        HTML parser (callback=self.parse), which extracted nothing and
        re-queued the whole pagination loop on every response; and
        range(2, pages) silently dropped the last page.
        """
        for url in response.css('h3.search-result-title a ::attr(href)').extract():
            yield scrapy.Request(response.urljoin(url), callback=self.parse_article)

        # "load more results": schedule every remaining page exactly once.
        job_count = 1970   # total hits shown by the site — TODO: read from response
        job_per_page = 10
        pages = math.ceil(job_count / job_per_page)
        for page in range(2, pages + 1):
            yield scrapy.Request(
                self.MORE_RESULTS_URL.format(page),
                callback=self.parse_more,
            )

    def parse_more(self, response):
        """Parse one JSONP "load more" response and follow each article.

        The payload looks like ``addMoreNewsResults({...});`` where the
        argument is a JS object literal (bare keys, single quotes), not
        strict JSON — normalise it before ``json.loads``.
        """
        match = re.search(r'addMoreNewsResults\((.+)\);', response.text, re.DOTALL)
        if match is None:
            self.logger.warning('unexpected JSONP payload at %s', response.url)
            return
        js_object = match.group(1)
        # Quote bare keys, quote bare word values, single -> double quotes.
        js_object = re.sub(r'^\s*(\w+):', r'"\1":', js_object, flags=re.MULTILINE)
        js_object = re.sub(r'(\w+),\s*$', r'"\1",', js_object, flags=re.MULTILINE)
        js_object = re.sub(r":\s*'(.+?)',\s*$", r': "\1",', js_object, flags=re.MULTILINE)
        for result in json.loads(js_object)['news']:
            yield scrapy.Request(response.urljoin(result['href']),
                                 callback=self.parse_article)

    def parse_article(self, response):
        """Print headline, URL and date of a single article page."""
        print('\n')
        print('***Heading:***', response.css('h1.ArticleHeader_headline_2zdFM ::text').extract_first())
        print('***Url-Link:***', response.url)
        print('***Date :***', response.css('div.ArticleHeader_date_V9eGk ::text').extract())
        print('\n')
答案 0（得分：1）：
每次点击“LOAD MORE RESULTS”都会返回带有JSON对象的Javascript响应:
if (typeof addMoreNewsResults == 'function') {
addMoreNewsResults( {
blob: 'National+Health+Investors%2C+Inc.',
sortBy: 'relevance',
dateRange: 'all',
totalResultNumber: 1970,
totalResultNumberStr: "1,970",
news: [
{
id: "-pbm-push-idUSKBN1DG2CP",
headline: "Diplomat Pharmacy plunges as <b>investors<\/b> fret over rapid PBM push",
date: "November 16, 2017 11:22am EST",
href: "/article/us-diplomat-stocks/diplomat-pharmacy-plunges-as-investors-fret-over-rapid-pbm-push-idUSKBN1DG2CP",
blurb: "...(Reuters) - Shares of Diplomat Pharmacy <b>Inc<\/b> <DPLO.N> tumbled 20... <b>National<\/b> Pharmaceutical Services.\nSome analysts were not excited...",
mainPicUrl: ""
},
{....
因此，您需要使用不同的解析机制来获取所需的信息（`import json`、`json.loads()` 等）。
还有更简单的方法：您可以在一个请求中获取所有内容（只需调大 `numResultsToShow` 参数即可一次获得全部结果）：
https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=2000&pn=1&callback=addMoreNewsResults
**更新**
# -*- coding: utf-8 -*-
import scrapy
import re
import json
class ReutersSpider(scrapy.Spider):
    """Fetch every search hit in one request via the JSONP endpoint
    (numResultsToShow=2000 covers all ~1970 results)."""
    name = "reuters"
    start_urls = [
        'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=2000&pn=1&callback=addMoreNewsResults',
    ]

    def parse(self, response):
        """Yield {"href", "date"} dicts for every result in the JSONP payload.

        Fixes over the original: ``response.body`` is bytes, so matching it
        with a ``str`` pattern raises TypeError on Python 3 — use
        ``response.text``; and an unmatched pattern no longer crashes on
        ``.group(1)``.
        """
        match = re.search(r'addMoreNewsResults\((.+?) \);', response.text, re.DOTALL)
        if match is None:
            self.logger.warning('JSONP wrapper not found at %s', response.url)
            return
        json_string = match.group(1)
        # Transform the Javascript-ish JSON-like structure into strict JSON:
        # quote bare keys, quote bare word values, single -> double quotes.
        json_string = re.sub(r'^\s*(\w+):', r'"\1":', json_string, flags=re.MULTILINE)
        json_string = re.sub(r'(\w+),\s*$', r'"\1",', json_string, flags=re.MULTILINE)
        json_string = re.sub(r":\s*'(.+?)',\s*$", r': "\1",', json_string, flags=re.MULTILINE)
        results = json.loads(json_string)
        for result in results["news"]:
            yield {"href": result["href"], "date": result["date"]}