提高处理大数据(bz2文件)的速度

时间:2019-04-08 14:16:47

标签: python json pandas dataframe bz2

我编写此代码来过滤具有“ json”结构的bz2文件,并将过滤后的数据写入新文件。我试图通过使用例如库ujson而不是json来提高脚本的速度。

还有哪些其他方法可以提高性能?

import bz2
import ujson
import json
from pandas.io.json import json_normalize

filenameInput = "comJson.bz2"
filenameOutput = "com.json"

filterList = ["id10",
              "id11",
              "id12"]

with bz2.open(filenameInput, "rt") as bzinput:

    with open(filenameOutput, "x") as jsonOutput:
        lines = []

        #reading bz2 one at a time
        for i, line in enumerate(bzinput):

            line_json = ujson.loads(line)
            df = json_normalize(line_json)

            #filter data by if column "id" matches id in list "filterList"
            if df.id[0] in filterList:
                df_subset = df[['id', 'classA', 'classB', 'classC']]

                json_string = json.dumps({'id': df_subset.id[0],
                                          'classA': df_subset.classA[0],
                                          'classB': df_subset.classB[0],
                                          'classC': df_subset.classC[0]},
                                         indent=None)

                jsonOutput.write(json_string + '\n')

函数消耗的时间:

 #runtime:  0.002026081085205078s
 line_json = ujson.loads(line)
 df = json_normalize(line_json)


 #runtime:  0.0s 
 if df.id[0] in filterList:

 #runtime:  0.001997304916381836s 
 df_subset = df[['id', 'classA', 'classB', 'classC']]

 #runtime:  0.002999544143676758s
 json_string = json.dumps({'id': df_subset.id[0],
                                      'classA': df_subset.classA[0],
                                      'classB': df_subset.classB[0],
                                      'classC': df_subset.classC[0]},
                                     indent=None)

 #runtime:  0.006982088088989258s
 jsonOutput.write(json_string + '\n')

0 个答案:

没有答案