将嵌入式JSON转换为平面表 - 提高效率

时间:2016-06-14 10:08:08

标签: python json csv

我是一名Python新手,决定用Python编写一个简单的应用程序,把JSON文件转换为平面表,并以CSV格式保存输出。我想请教一下如何改进我的代码,使它运行得更高效。之所以这样问,是因为转换较小的文件时一切正常,但当我尝试转换约200 MB的文件时,就开始需要相当一段时间。我担心以后处理更大的文件时,转换数据集会耗费很长时间。

这是我的代码,我是参考这篇关于展平JSON对象的博客文章编写的:

import sys, os, json, tkFileDialog, tkMessageBox
from Tkinter import *
from pandas.io.json import json_normalize

def openFile():
    """Show a file-open dialog and return whatever the dialog returns.

    The dialog starts in the current working directory and is restricted
    to ``.json`` files.
    """
    dialog_options = {
        'initialdir': os.getcwd(),
        'title': 'Please select a file',
        'filetypes': [('JSON file', '.json')],
    }
    return tkFileDialog.askopenfilename(**dialog_options)

def loading_file(path):
    """Parse the JSON file at ``path`` and return the decoded data.

    The file is opened in a ``with`` block so the handle is closed as soon
    as parsing finishes, even when ``json.load`` raises.
    """
    with open(path) as json_file:
        return json.load(json_file)

def _flatten_into(node, prefix, out):
    """Recursively copy the leaf values of ``node`` into the flat dict ``out``.

    ``prefix`` is the column name built so far; it always ends with ``'_'``
    (or is empty), and that trailing character is dropped when a leaf is
    stored.
    """
    if isinstance(node, dict):
        # A dict is treated as a name/value pair only when it has both keys;
        # otherwise "name" is flattened like any other key instead of
        # raising KeyError on the missing "value".
        has_value = "value" in node
        for key in node:
            if key == "name" and has_value:
                _flatten_into(node["value"], prefix + node[key] + '_', out)
            else:
                _flatten_into(node[key], prefix + key + '_', out)
    elif isinstance(node, list):
        # NOTE: every element of a list gets the same prefix, so leaves that
        # sit at the same path in different elements overwrite each other
        # (the last element wins).
        for item in node:
            _flatten_into(item, prefix + '_', out)
    else:
        out[prefix[:-1]] = node


def flatten_json(data):
    """Flatten every record of ``data`` and return the result of ``json_normalize``.

    Args:
        data: a list of decoded JSON objects, or a single decoded JSON
            object (which is treated as a one-record list).

    Returns:
        ``json_normalize`` applied to the list of flat dictionaries, one
        dictionary per record. Column names are the keys along the path to
        each leaf joined with ``'_'``; a dict holding both ``"name"`` and
        ``"value"`` additionally contributes a column named after the
        content of ``"name"`` holding the content of ``"value"``.
    """
    if isinstance(data, dict):
        data = [data]

    flat = []  # list of flat dictionaries, one per record
    for record in data:
        out = {}
        _flatten_into(record, '', out)
        flat.append(out)

    return json_normalize(flat)




def csv_out(data, path):
    """Write ``data`` as a UTF-8 CSV named after the input file.

    The target is ``~/Desktop/<base name of path without extension>.csv``.
    ``data`` must provide a ``to_csv(name, encoding=...)`` method.
    """
    root, _extension = os.path.splitext(path)
    csv_name = '~/Desktop/' + os.path.basename(root) + '.csv'
    data.to_csv(csv_name, encoding='utf-8')

def done():
    """Show an info dialog titled 'json2csv' with the message 'DONE!'."""
    tkMessageBox.showinfo(title='json2csv', message="DONE!")

def main():
    """Run one conversion: pick a file, flatten it, write the CSV, notify.

    Wired to the "Convert to .csv" button as its callback.
    """
    filepath = openFile()
    if not filepath:
        # The file dialog returns an empty value when the user cancels;
        # without this guard the empty path would be passed to
        # loading_file and the callback would fail there.
        return

    table = flatten_json(loading_file(filepath))
    csv_out(table, filepath)
    done()

### Application Interface ###
# Main window: title plus a 250x150 geometry with +600+300 offsets.
tk = Tk()
tk.title('JSON2CSV')
tk.geometry('250x150+600+300')

# The only control: a button that runs main() when clicked.
convertbutton = Button(tk, text='Convert to .csv', command=main)
convertbutton.place(x=25, y=50)

# Start the Tk event loop.
tk.mainloop()

在这里,您将找到我使用的json结构的简短和简单示例:

[{
 "_id": {
   "id": "123"
 },
 "device": {
   "browser": "Safari",
   "category": "d",
   "os": "Mac"
 },
 "exID": {
   "$oid": "123"
 },
 "extreme": false,
 "geo": {
   "city": "London",
   "country": "United Kingdom",
   "countryCode": "UK",
   "ip": "00.000.000.0"
 },
 "viewed": {
   "$date": "2011-02-12"
 },
 "attributes": [{
   "name": "gender",
   "numeric": 0,
   "value": 0
 }, {
   "name": "email",
   "value": false
 }],
 "change": [{
   "id": {
     "$id": "1231"
   },
   "seen": [{
     "$date": "2011-02-12"
   }]
 }]
}, {
 "_id": {
   "id": "456"
 },
 "device": {
   "browser": "Chrome 47",
   "category": "d",
   "os": "Windows"
 },
 "exID": {
   "$oid": "345"
 },
 "extreme": false,
 "geo": {
   "city": "Berlin",
   "country": "Germany",
   "countryCode": "DE",
   "ip": "00.000.000.0"
 },
 "viewed": {
   "$date": "2011-05-12"
 },
 "attributes": [{
   "name": "gender",
   "numeric": 1,
   "value": 1
 }, {
   "name": "email",
   "value": true
 }],
 "change": [{
   "id": {
     "$id": "1231"
   },
   "seen": [{
     "$date": "2011-02-12"
   }]
 }]
}]

1 个答案:

答案 0 :(得分:0)

@ machine-yearning是对的。尽量避免嵌套 - 这很快就会变得难看。

下面是我为帮助你而写的一个示例:

import json
import csv

def get_ids(data):
    """Return the ``["_id"]["id"]`` value of every record in ``data``, in order.

    Raises:
        KeyError: if a record lacks ``"_id"`` or its ``"id"`` entry.
    """
    # A comprehension avoids the temporary that shadowed the builtin ``id``.
    return [datum["_id"]["id"] for datum in data]

def get_devices(data):
    """Return one ``[browser, category, os]`` list per record in ``data``.

    The three values are read from each record's ``"device"`` mapping.
    """
    wanted_fields = ("browser", "category", "os")
    rows = []
    for datum in data:
        rows.append([datum["device"][field] for field in wanted_fields])
    return rows

def flatten_json(json_file, output_fn):
    """Flatten JSON text into CSV rows and append them to ``output_fn``.

    Args:
        json_file: the JSON document as a string. Despite the name it is
            passed to ``json.loads``, not opened as a file.
        output_fn: path of the CSV file. Rows are appended, so calling the
            function again adds to the existing content.

    Returns:
        The string ``"FLATTENED DATA SAVED"``.
    """
    import sys  # local import: only needed to choose the file mode below

    data = json.loads(json_file)

    # One row per record: id followed by the three device fields.
    flattened_data = [
        [record_id, browser, category, os_name]
        for record_id, (browser, category, os_name)
        in zip(get_ids(data), get_devices(data))
    ]

    if sys.version_info[0] >= 3:
        # On Python 3 the csv module writes str, so the file must be opened
        # in text mode with newline='' (binary mode raises TypeError).
        my_csv = open(output_fn, 'a', newline='')
    else:
        # On Python 2 the csv module expects a binary-mode file.
        my_csv = open(output_fn, 'ab')

    with my_csv:
        csv.writer(my_csv, delimiter=',').writerows(flattened_data)
    return "FLATTENED DATA SAVED"

# example to make this run (x must already hold the JSON text, because
# flatten_json passes its first argument to json.loads):
flatten_json(x, "my_flattened_file.csv")

当然,您还需要编写一个函数,从网站或已保存的文件中获取JSON数据。参照您的数据示例,您可以仿照 get_devices 为 geo 和 attributes 各写一个类似的函数,然后像我在 flatten_json 函数中处理其他字段那样,把它们也加进去。

希望这有帮助!