有什么方法可以分段读取JSON文件吗?

时间:2020-05-29 01:15:24

标签: python sql json ijson

我正在尝试从 web 读取一些 JSON 文件,并使用数据创建一个 SQL 数据库。我正在使用 ijson 读取数据流。但是,当代码失败时,我需要重新开始以获取数据。 有什么方法可以从程序失败的地方继续读取JSON文件吗?

我可以使用json.loads读取整个文档,但是我假设数据太大而无法一次读取。

您可以在下面看到我的代码。

import sqlite3
import ssl
import urllib.request
import json
import ijson
import time

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

conn = sqlite3.connect('rawdata.sqlite')
cur = conn.cursor()

cur.execute('''DROP TABLE IF EXISTS DailyData ''')
cur.execute('''DROP TABLE IF EXISTS Countries ''')
cur.execute('''DROP TABLE IF EXISTS Continents ''')


cur.execute('''CREATE TABLE IF NOT EXISTS DailyData
    (id INTEGER, Day TEXT, Month TEXT, Year TEXT, country_id INTEGER, continent_id INTEGER, Cases TEXT, Deaths TEXT)''')

cur.execute('''CREATE TABLE IF NOT EXISTS Countries
    (id INTEGER,     CountryCode TEXT UNIQUE, Country TEXT UNIQUE, Population TEXT, continent_id INTEGER)''')

cur.execute('''CREATE TABLE IF NOT EXISTS Continents
    (id INTEGER, Continent TEXT UNIQUE)''')

url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
f = urllib.request.urlopen(url, context=ctx)
reports = ijson.items(f, 'records.item')

sum = 0
count = 0
# error = 0
for item in reports :

    iDataRep = item.get('dateRep')
    iCases = item.get('cases')
    iDeaths = item.get('deaths')
    iCountryCode = item.get('countryterritoryCode')
    iCountry = item.get('countriesAndTerritories')
    iPopulation = item.get('popData2018')
    iContinent = item.get('continentExp')

    if len(iDataRep) < 0: iDataRep = 0
    if len(iCases) < 0: iCases = 0
    if len(iDeaths) < 0: iDeaths = 0
    if len(iCountryCode) < 0: iCountryCode = 0
    if len(iCountry) < 0: iCountry = 0
    if len(iPopulation) < 0: iPopulation = 0
    if len(iContinent) < 0: iContinent = 0

    Spl = iDataRep.split('/')
    iDay = Spl[0]
    iMonth = Spl[1]
    iYear = Spl[2]
    id = count + 1

    cur.execute('''INSERT OR IGNORE INTO Continents (id, Continent)
        VALUES ( ?, ? )''', (id, iContinent))

    cur.execute('''SELECT id FROM Continents WHERE Continent = ? ''', (iContinent, ))
    continent_id = cur.fetchone()[0]

    cur.execute('''INSERT OR IGNORE INTO Countries (id, CountryCode, Country, Population, continent_id)
        VALUES ( ?, ?, ?, ?, ? )''', (id, iCountryCode, iCountry, iPopulation, continent_id) )

    cur.execute('''SELECT id FROM Countries WHERE Country = ? ''', (iCountry, ))
    country_id = cur.fetchone()[0]

    cur.execute('''INSERT OR IGNORE INTO DailyData (id, Day, Month, Year, country_id, continent_id, Cases, Deaths)
        VALUES ( ?, ?, ?, ?, ?, ?, ? ,?)''', (id, iDay, iMonth, iYear, country_id, continent_id, iCases, iDeaths) )

    conn.commit()

    # except:
    #     error = error + 1
    #     print(error)
    #     continue

    count = count + 1

    print(count, 'data retrieved...')

    if count % 95 == 0:
        time.sleep(1)
        print('Program slept a second.')

numCountry = cur.execute('SELECT max(id) FROM Countries' )
numContinent = cur.execute('SELECT max(id) FROM Continents' )

print('From', numCountry, 'different countries and', numContinent, 'continents', count, 'data retrieved.')

cur.close()

0 个答案:

没有答案