Web scraping coronavirus interactive graphs

Date: 2020-03-06 01:44:49

Tags: python json web-scraping graph beautifulsoup

I am trying to scrape COVID-19-related data. I can scrape some figures from the website, such as the total case counts, but not the data inside the interactive graphs.

I usually scrape interactive graphs by finding the JSON source under the 'Network' tab of the browser's inspect-element view. However, for these interactive graphs I cannot find any 'Network' source to scrape.

Can someone help me scrape the data from the 'Total Deaths' graph, or any other graph on the site? Thanks.

Just to be clear: I do not want to scrape the data from the countries table — I have already done that. What I want is the data from the graphs, e.g. the death-rate-vs-date graph or the active-cases-vs-date graph.
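The usual Network-tab workflow mentioned above boils down to requesting the JSON endpoint directly and decoding the response; a minimal sketch, with a made-up payload standing in for what such an endpoint would return (this site embeds the data in JavaScript instead, which is why that workflow fails here):

```python
import json

# stand-in for the response body of a JSON endpoint found in the Network tab
payload = '{"dates": ["Mar 04", "Mar 05"], "deaths": [3285, 3387]}'

data = json.loads(payload)
series = dict(zip(data["dates"], data["deaths"]))
print(series)  # {'Mar 04': 3285, 'Mar 05': 3387}
```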

Thanks

import requests
import urllib.request
import time
import json
from bs4 import BeautifulSoup 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

url = 'https://www.worldometers.info/coronavirus/#countries'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

For example, the number of affected countries:

len(soup.find_all('table',{'id':'main_table_countries'})[0].find('tbody').find_all('tr'))


3 Answers:

Answer 0 (score: 1)

As I mentioned in the comments, the values are embedded in JavaScript as Highcharts.chart(....), so I tried to get them using a few different methods.

Most of them require manually hunting through the parsed data and building the correct indexes or xpath to reach the values, so it is not easy.

The simplest method is js2xml, which I saw in the link @AyushGarg posted: Can I scrape the raw data from highcharts.js?

The hardest method is pyjsparser.

import requests
from bs4 import BeautifulSoup 
import json
#import dirtyjson
import pyjsparser
import js2xml

url= 'https://www.worldometers.info/coronavirus/#countries'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

scripts = soup.find_all('script')

script = scripts[24].text  # hard-coded index of the script that contains Highcharts.chart
print(script)

print('\n--- pyjsparser ---\n')
data = pyjsparser.parse(script) 
data = data['body'][0]['expression']['arguments'][1]['properties'][-2]['value']['elements'][0]['properties'][-1]['value']['elements'] # a lot of work to find it
#print(json.dumps(data, indent=2))
data = [x['value'] for x in data]
print(data)
# text values
# it needs work

print('\n--- eval ---\n')
data = script.split('data: [', 1)[1].split(']', 1)[0]
data = eval(data)  # it creates tuple
print(data)
# text values
data = script.split("title: { text: '", 1)[-1].split("'", 1)[0]
print(data)
data = script.split("title: { text: '", 3)[-1].split("'", 1)[0]
print(data)

print('\n--- json ---\n')
data = script.split('data: [', 1)[1].split(']', 1)[0]
data = '[' + data + ']' # create correct JSON data
data = json.loads(data) # this time doesn't need `dirtyjson`
print(data)
# text values
data = script.split("title: { text: '", 1)[-1].split("'", 1)[0]
print(data)
data = script.split("title: { text: '", 3)[-1].split("'", 1)[0]
print(data)

print('\n--- js2xml ---\n')
data = js2xml.parse(script)
print(data.xpath('//property[@name="data"]//number/@value')) # nice and short xpath
# text values
#print(js2xml.pretty_print(data.xpath('//property[@name="title"]')[0]))
text = data.xpath('//property[@name="title"]//string/text()')
print(text[0])
print(text[1])

Result

 Highcharts.chart('coronavirus-deaths-linear', { chart: { type: 'line' }, title: { text: 'Total Deaths' }, subtitle: { text: '(Linear Scale)' }, xAxis: { categories: ["Jan 22","Jan 23","Jan 24","Jan 25","Jan 26","Jan 27","Jan 28","Jan 29","Jan 30","Jan 31","Feb 01","Feb 02","Feb 03","Feb 04","Feb 05","Feb 06","Feb 07","Feb 08","Feb 09","Feb 10","Feb 11","Feb 12","Feb 13","Feb 14","Feb 15","Feb 16","Feb 17","Feb 18","Feb 19","Feb 20","Feb 21","Feb 22","Feb 23","Feb 24","Feb 25","Feb 26","Feb 27","Feb 28","Feb 29","Mar 01","Mar 02","Mar 03","Mar 04","Mar 05"] }, yAxis: { title: { text: 'Total Coronavirus Deaths' } }, legend: { layout: 'vertical', align: 'right', verticalAlign: 'middle' }, credits: { enabled: false }, series: [{ name: 'Deaths', color: '#FF9900', lineWidth: 5, data: [17,25,41,56,80,106,132,170,213,259,304,362,426,492,565,638,724,813,910,1018,1115,1261,1383,1526,1669,1775,1873,2009,2126,2247,2360,2460,2618,2699,2763,2800,2858,2923,2977,3050,3117,3202,3285,3387] }], responsive: { rules: [{ condition: { maxWidth: 800 }, chartOptions: { legend: { layout: 'horizontal', align: 'center', verticalAlign: 'bottom' } } }] } }); 

--- pyjsparser ---

[17.0, 25.0, 41.0, 56.0, 80.0, 106.0, 132.0, 170.0, 213.0, 259.0, 304.0, 362.0, 426.0, 492.0, 565.0, 638.0, 724.0, 813.0, 910.0, 1018.0, 1115.0, 1261.0, 1383.0, 1526.0, 1669.0, 1775.0, 1873.0, 2009.0, 2126.0, 2247.0, 2360.0, 2460.0, 2618.0, 2699.0, 2763.0, 2800.0, 2858.0, 2923.0, 2977.0, 3050.0, 3117.0, 3202.0, 3285.0, 3387.0]

--- eval ---

(17, 25, 41, 56, 80, 106, 132, 170, 213, 259, 304, 362, 426, 492, 565, 638, 724, 813, 910, 1018, 1115, 1261, 1383, 1526, 1669, 1775, 1873, 2009, 2126, 2247, 2360, 2460, 2618, 2699, 2763, 2800, 2858, 2923, 2977, 3050, 3117, 3202, 3285, 3387)
Total Deaths
Total Coronavirus Deaths

--- json ---

[17, 25, 41, 56, 80, 106, 132, 170, 213, 259, 304, 362, 426, 492, 565, 638, 724, 813, 910, 1018, 1115, 1261, 1383, 1526, 1669, 1775, 1873, 2009, 2126, 2247, 2360, 2460, 2618, 2699, 2763, 2800, 2858, 2923, 2977, 3050, 3117, 3202, 3285, 3387]
Total Deaths
Total Coronavirus Deaths

--- js2xml ---

['17', '25', '41', '56', '80', '106', '132', '170', '213', '259', '304', '362', '426', '492', '565', '638', '724', '813', '910', '1018', '1115', '1261', '1383', '1526', '1669', '1775', '1873', '2009', '2126', '2247', '2360', '2460', '2618', '2699', '2763', '2800', '2858', '2923', '2977', '3050', '3117', '3202', '3285', '3387']
Total Deaths
Total Coronavirus Deaths
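The extracted `xAxis` categories and `data` values (both visible in the output above) can be paired into a table for plotting; a minimal sketch with a shortened sample of those two lists:

```python
import pandas as pd

# shortened sample of the categories and data lists shown in the output above
dates = ["Mar 01", "Mar 02", "Mar 03", "Mar 04", "Mar 05"]
deaths = [3050, 3117, 3202, 3285, 3387]

df = pd.DataFrame({"date": dates, "total_deaths": deaths})
print(df)
```

From here, `df.plot(x="date", y="total_deaths")` would reproduce the chart.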

EDIT: the page's structure has changed, so the code also needed some changes

import requests
from bs4 import BeautifulSoup 
import json
#import dirtyjson
import js2xml
import pyjsparser

# --- functions ---

def test_eval(script):
    print('\n--- eval ---\n')

    # chart values
    text = script.split('data: [', 1)[1] # beginning
    text = text.split(']', 1)[0] # end
    values = eval(text)  # it creates tuple
    print(values)

    # title 
    # I split `yAxis` because there is other `title` without text
    # I split beginning in few steps because text may have different indentations (different number of spaces)
    # (you could use regex to split in one step)
    text = script.split("title: {\n", 1)[1] # beginning
    text = text.split("text: '", 1)[1] # beginning
    title = text.split("'", 1)[0] # end
    print('\ntitle:', title)

    text = script.split("yAxis: {\n", 1)[1] # beginning
    text = text.split("title: {\n", 1)[1] # beginning
    text = text.split("text: '", 1)[1] # beginning
    title = text.split("'", 1)[0] # end
    print('\ntitle:', title)

def test_json(script):
    print('\n--- json ---\n')

    # chart values
    text = script.split('data: [', 1)[1] # beginning
    text = text.split(']', 1)[0] # end
    text = '[' + text + ']' # create correct JSON data
    values = json.loads(text) # this time doesn't need `dirtyjson`
    print(values)

    # title
    # I split `yAxis` because there is other `title` without text
    # I split beginning in few steps because text may have different indentations (different number of spaces)
    # (you could use regex to split in one step)
    text = script.split("title: {\n", 1)[1] # beginning
    text = text.split("text: '", 1)[1] # beginning
    title = text.split("'", 1)[0] # end
    print('\ntitle:', title)

    text = script.split("yAxis: {\n", 1)[1] # beginning
    text = text.split("title: {\n", 1)[1] # beginning
    text = text.split("text: '", 1)[1] # beginning
    title = text.split("'", 1)[0] # end
    print('\ntitle:', title)

def test_js2xml(script):
    print('\n--- js2xml ---\n')

    data = js2xml.parse(script)

    # chart values (short and nice path)
    values = data.xpath('//property[@name="data"]//number/@value')
    #values = [int(x) for x in values] # it may need to convert to int() or float()
    #values = [float(x) for x in values] # it may need to convert to int() or float()
    print(values)

    # title (short and nice path)
    #print(js2xml.pretty_print(data.xpath('//property[@name="title"]')[0]))
    #title = data.xpath('//property[@name="title"]//string/text()')
    #print(js2xml.pretty_print(data.xpath('//property[@name="yAxis"]//property[@name="title"]')[0]))

    title = data.xpath('//property[@name="title"]//string/text()')
    title = title[0]
    print('\ntitle:', title)

    title = data.xpath('//property[@name="yAxis"]//property[@name="title"]//string/text()')
    title = title[0]
    print('\ntitle:', title)

def test_pyjsparser(script):
    print('\n--- pyjsparser ---\n')

    data = pyjsparser.parse(script)

    print("body's number:", len(data['body']))

    for number, body in enumerate(data['body']):
        if (body['type'] == 'ExpressionStatement'
            and body['expression']['callee']['object']['name'] == 'Highcharts'
            and len(body['expression']['arguments']) > 1):

            arguments = body['expression']['arguments']
            #print(json.dumps(values, indent=2))
            for properties in arguments[1]['properties']:
                #print('name: >{}<'.format(p['key']['name']))
                if properties['key']['name'] == 'series':
                    values = properties['value']['elements'][0]
                    values = values['properties'][-1]
                    values = values['value']['elements'] # a lot of work to find it
                    #print(json.dumps(values, indent=2))

                    values = [x['value'] for x in values]
                    print(values)

    # title (very complicated path) 
    # It needs more work to find correct indexes to get title
    # so I skip this part as too complex.

# --- main ---

url= 'https://www.worldometers.info/coronavirus/#countries'

r = requests.get(url)
#print(r.text)
soup = BeautifulSoup(r.text, "html.parser")

all_scripts = soup.find_all('script')
print('number of scripts:', len(all_scripts))

for number, script in enumerate(all_scripts):

    #if 'data: [' in script.text:
    if 'Highcharts.chart' in script.text:
        print('\n=== script:', number, '===\n')
        test_eval(script.text)
        test_json(script.text)
        test_js2xml(script.text)
        test_pyjsparser(script.text)
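The comments in `test_eval` and `test_json` mention that a regex could replace the multi-step title split with one step; a minimal sketch on a trimmed Highcharts snippet:

```python
import re

# trimmed sample of the Highcharts script, with varying indentation
script = """Highcharts.chart('coronavirus-deaths-linear', {
    title: {
        text: 'Total Deaths'
    },
    yAxis: {
        title: {
            text: 'Total Coronavirus Deaths'
        }
    }
})"""

# one regex tolerates any whitespace between `title: {` and `text: '...'`
titles = re.findall(r"title:\s*\{\s*text:\s*'([^']*)'", script)
print(titles)  # ['Total Deaths', 'Total Coronavirus Deaths']
```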

I took it from my blog post: Scraping: How to get data from interactive plot created with HighCharts

Answer 1 (score: 1)

I used furas's code and made a version that gets the data for several different countries.

import requests
from bs4 import BeautifulSoup
import json
import js2xml

def scrape(country):
    url = 'https://www.worldometers.info/coronavirus/country/' + country + '/'
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    retlist = [country]
    for script in soup.find_all('script'):
        try:
            parsed = js2xml.parse(script.string)  # parse each script only once
            title = parsed.xpath('//property[@name="title"]//string/text()')[0]
            if title in ['Total Cases', 'Active Cases', 'Total Deaths']:
                retlist.append(title)
                retlist += json.loads('[' + script.string.split('data: [', 1)[1].split(']', 1)[0] + ']')
        except Exception:
            pass  # script has no parsable chart - skip it
    return retlist

countries = ['us', 'spain', 'italy', 'france', 'germany', 'iran', 'china', 'uk', 'south-korea']
for country in countries:
    print(scrape(country))
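The lists returned by scrape() are flat — country name, then a chart title followed by its values, repeated. A small helper (to_series, a hypothetical name of my own) can regroup such a flat list into one list per chart; a minimal sketch:

```python
def to_series(retlist):
    """Split ['us', 'Total Cases', 1, 2, 'Total Deaths', 3, 4] into
    ('us', {'Total Cases': [1, 2], 'Total Deaths': [3, 4]})."""
    country, rest = retlist[0], retlist[1:]
    series = {}
    current = None
    for item in rest:
        if isinstance(item, str):   # a string marks the start of a new chart
            current = item
            series[current] = []
        else:                       # numbers belong to the current chart
            series[current].append(item)
    return country, series

country, series = to_series(['us', 'Total Cases', 1, 2, 'Total Deaths', 3, 4])
print(country, series)  # us {'Total Cases': [1, 2], 'Total Deaths': [3, 4]}
```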

Answer 2 (score: 0)

Here is my take:

First, you have to get the table as a NumPy array:

import requests
from bs4 import BeautifulSoup
import numpy as np

def convertDigit(string):
    if string.replace(",", "").isdigit():
        return int(string.replace(",", ""))
    return string

url = 'https://www.worldometers.info/coronavirus/#countries'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser") # Parse html

table = soup.find("table", {"id": "main_table_countries"}).find_all("tbody") # table
tr_elems = table[0].find_all("tr") # All rows in table

data = []
for tr in tr_elems: # Loop through rows
    td_elems = tr.find_all("td") # Each column in row
    data.append([convertDigit(td.text.strip()) for td in td_elems])

np_array = np.array(data)

现在,您的所有数据都在np_array内部。之后,将您的numpy数组转换为图形应该非常简单。