我编写此代码来过滤具有“ json”结构的bz2文件,并将过滤后的数据写入新文件。我试图通过使用例如库ujson
而不是json
来提高脚本的速度。
还有哪些其他方法可以提高性能?
import bz2
import ujson
import json
from pandas.io.json import json_normalize
filenameInput = "comJson.bz2"
filenameOutput = "com.json"
filterList = ["id10",
"id11",
"id12"]
with bz2.open(filenameInput, "rt") as bzinput:
with open(filenameOutput, "x") as jsonOutput:
lines = []
#reading bz2 one at a time
for i, line in enumerate(bzinput):
line_json = ujson.loads(line)
df = json_normalize(line_json)
#filter data by if column "id" matches id in list "filterList"
if df.id[0] in filterList:
df_subset = df[['id', 'classA', 'classB', 'classC']]
json_string = json.dumps({'id': df_subset.id[0],
'classA': df_subset.classA[0],
'classB': df_subset.classB[0],
'classC': df_subset.classC[0]},
indent=None)
jsonOutput.write(json_string + '\n')
函数消耗的时间:
#runtime: 0.002026081085205078s
line_json = ujson.loads(line)
df = json_normalize(line_json)
#runtime: 0.0s
if df.id[0] in filterList:
#runtime: 0.001997304916381836s
df_subset = df[['id', 'classA', 'classB', 'classC']]
#runtime: 0.002999544143676758s
json_string = json.dumps({'id': df_subset.id[0],
'classA': df_subset.classA[0],
'classB': df_subset.classB[0],
'classC': df_subset.classC[0]},
indent=None)
#runtime: 0.006982088088989258s
jsonOutput.write(json_string + '\n')