我是一名Python新手,决定用Python编写一个简单的应用程序,把JSON文件转换成平面表,并以CSV格式保存输出。想请教一下如何改进我的代码,使其运行得更高效。之所以这样问,是因为转换较小的文件时一切正常,但当我尝试转换约200 MB的文件时,就要花相当长的时间。我担心以后处理更大的文件时,转换数据集会非常耗时。
下面是我的代码,它是参考一篇关于展平JSON对象的博客文章写成的:
import sys, os, json, tkFileDialog, tkMessageBox
from Tkinter import *
from pandas.io.json import json_normalize
def openFile():
    """Ask the user to pick a JSON file and return the chosen path.

    The dialog starts in the current working directory and filters on the
    ``.json`` extension. Whatever ``tkFileDialog.askopenfilename`` returns
    is handed back unchanged.
    """
    dialog_options = {
        'initialdir': os.getcwd(),
        'title': 'Please select a file',
        'filetypes': [('JSON file', '.json')],
    }
    return tkFileDialog.askopenfilename(**dialog_options)
def loading_file(path):
    """Parse the JSON document stored at *path* and return the decoded data.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes, including when ``json.load`` raises.
    """
    with open(path) as json_file:
        return json.load(json_file)
def _flatten_record(record):
    """Flatten one nested JSON value into a single-level dictionary.

    Keys of nested dicts are joined with ``'_'``. Every list adds one extra
    ``'_'`` to the prefix and all of its items share that prefix, so later
    items overwrite earlier ones when they produce the same column name.
    A dict holding both ``"name"`` and ``"value"`` additionally contributes
    a column named after the content of ``"name"``, holding ``"value"``.
    """
    out = {}

    def walk(node, prefix=''):
        if isinstance(node, dict):
            for key in node:
                if key == "name" and "value" in node:
                    # name/value pair: the name itself becomes the column
                    walk(node["value"], prefix + node[key] + '_')
                else:
                    # Also reached for a "name" without a sibling "value",
                    # which is kept as an ordinary field instead of raising
                    # KeyError.
                    walk(node[key], prefix + key + '_')
        elif isinstance(node, list):
            for item in node:
                walk(item, prefix + '_')
        else:
            # Leaf value: drop the trailing '_' left by the last key
            out[prefix[:-1]] = node

    walk(record)
    return out


def flatten_json(data):
    """Flatten every object in *data* and return them as one table.

    *data* is a sequence of decoded JSON objects. Each one becomes a flat
    dictionary (one future row), and the list of rows is passed to
    ``json_normalize``, whose result is returned.
    """
    # Each flattened dict is freshly built, so no defensive copy is needed.
    return json_normalize([_flatten_record(record) for record in data])
def csv_out(data, path):
    """Save *data* as ``~/Desktop/<stem>.csv``.

    ``<stem>`` is the file name of *path* with its extension removed.
    *data* must provide a ``to_csv`` method; the output is written with
    UTF-8 encoding.
    """
    without_ext, _ = os.path.splitext(path)
    stem = os.path.basename(without_ext)
    target = ''.join(['~/Desktop/', stem, '.csv'])
    data.to_csv(target, encoding='utf-8')
def done():
    """Show an information box announcing that the conversion finished."""
    tkMessageBox.showinfo(title='json2csv', message="DONE!")
def main():
    """Run one conversion: pick a file, flatten it, write the CSV, notify.

    Wired to the convert button as its callback. Does nothing when no file
    was chosen.
    """
    filepath = openFile()
    if not filepath:
        # The file dialog hands back an empty value when the user cancels;
        # passing that on to loading_file would fail on open().
        return
    table = flatten_json(loading_file(filepath))
    csv_out(table, filepath)
    done()
### Application Interface ###
# Main window: 250x150 pixels, positioned at +600+300
tk = Tk()
tk.title('JSON2CSV')
tk.geometry('250x150+600+300')

# Single button that triggers the whole conversion through main()
convertbutton = Button(tk, command=main, text='Convert to .csv')
convertbutton.place(y=50, x=25)

# Start the Tk event loop
tk.mainloop()
下面是我所用JSON结构的一个简短示例:
[{
"_id": {
"id": "123"
},
"device": {
"browser": "Safari",
"category": "d",
"os": "Mac"
},
"exID": {
"$oid": "123"
},
"extreme": false,
"geo": {
"city": "London",
"country": "United Kingdom",
"countryCode": "UK",
"ip": "00.000.000.0"
},
"viewed": {
"$date": "2011-02-12"
},
"attributes": [{
"name": "gender",
"numeric": 0,
"value": 0
}, {
"name": "email",
"value": false
}],
"change": [{
"id": {
"$id": "1231"
},
"seen": [{
"$date": "2011-02-12"
}]
}]
}, {
"_id": {
"id": "456"
},
"device": {
"browser": "Chrome 47",
"category": "d",
"os": "Windows"
},
"exID": {
"$oid": "345"
},
"extreme": false,
"geo": {
"city": "Berlin",
"country": "Germany",
"countryCode": "DE",
"ip": "00.000.000.0"
},
"viewed": {
"$date": "2011-05-12"
},
"attributes": [{
"name": "gender",
"numeric": 1,
"value": 1
}, {
"name": "email",
"value": true
}],
"change": [{
"id": {
"$id": "1231"
},
"seen": [{
"$date": "2011-02-12"
}]
}]
}]
答案 0 :(得分:0)
@machine-yearning 说得对。尽量避免嵌套——嵌套很快就会让代码变得难看。
下面是我为帮助你而写的代码:
import json
import csv
def get_ids(data):
    """Return the ``["_id"]["id"]`` value of every record in *data*, in order."""
    return [datum["_id"]["id"] for datum in data]
def get_devices(data):
    """Return ``[browser, category, os]`` for the ``"device"`` entry of each record.

    The result has one inner list per record, in the same order as *data*.
    """
    device_entries = (datum["device"] for datum in data)
    return [
        [device["browser"], device["category"], device["os"]]
        for device in device_entries
    ]
def flatten_json(json_file, output_fn):
    """Append one CSV row per record of a JSON document to *output_fn*.

    Args:
        json_file: JSON text (a string, despite the name) holding a list of
            records; it is decoded with ``json.loads``.
        output_fn: path of the CSV file. It is opened in append mode, so
            rows are added after any existing content.

    Returns:
        The string ``"FLATTENED DATA SAVED"``.
    """
    import sys  # local import: only needed to pick the right file mode

    data = json.loads(json_file)

    flattened_data = []
    for record_id, device in zip(get_ids(data), get_devices(data)):
        browser, category, os_name = device
        flattened_data.append([record_id, browser, category, os_name])

    if sys.version_info[0] >= 3:
        # Python 3: the csv module needs a text-mode file with newline=''
        csv_handle = open(output_fn, 'a', newline='')
    else:
        # Python 2: the csv module needs a binary-mode file
        csv_handle = open(output_fn, 'ab')

    with csv_handle as my_csv:
        csv.writer(my_csv, delimiter=',').writerows(flattened_data)

    return "FLATTENED DATA SAVED"
# Example call: `x` is not defined in this snippet. It stands for the JSON
# text (a string) that flatten_json decodes with json.loads.
flatten_json(x, "my_flattened_file.csv")
当然,你还需要编写一个函数,从网站或本地文件中读取JSON数据。从你的数据示例来看,可以仿照 get_devices 为 geo 和 attributes 各写一个类似的函数,然后像我处理其他字段那样,在 flatten_json 函数中调用它们。
希望这有帮助!