Convert long-format data to wide format using JSON + pandas

Posted: 2019-09-27 12:20:58

Tags: python json pandas dictionary

I have 24 months of historical data, one file per month, and each account appears only once in a given file. I want to consolidate (for lack of a better word) all of an account's data into a single row, which is what .to_dict does in pandas.
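
To make the target shape concrete, here is a tiny hand-built sketch of the consolidated record I'm after; the values below are account 111's and match the first row of the sample output at the end of the post:

import json

# Target "wide" shape: one JSON object per account, keyed by field name,
# then by year, holding a 12-slot list with one value per month.
profile = {'BAL': {'2019': [None] * 12, '2018': [None] * 12}}
profile['BAL']['2019'][8] = 2128     # September 2019 (0-based month index 8)
profile['BAL']['2019'][9] = 80.71    # October 2019
profile['BAL']['2018'][7] = 1638     # August 2018

print(json.dumps(profile))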

Reproducible example:

import pandas as pd
import json
import os
import sys

f1 = {'ACC_ID': [111, 222, 333, 444, 555, 666, 777, 888, 999, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000],
      'FI_DATE': ['2019-10-29', '2019-10-30', '2019-10-13', '2019-10-22', '2019-10-27', '2019-10-18', '2019-10-10', '2019-10-30', '2019-10-18', '2019-10-23', '2019-10-20', '2019-10-27', '2019-10-21', '2019-10-14', '2019-10-27', '2019-10-18', '2019-10-29', '2019-10-25', '2019-10-24', '2019-10-18', '2019-10-20', '2019-10-28', '2019-10-23', '2019-10-11', '2019-10-28', '2019-10-17', '2019-10-06', '2019-10-05', '2019-10-06', '2019-10-27'],
      'BAL': [80.71, -260.71, -178.19, 1914.93, 7795.57, 1224.39, 1414.28, 4185.31, 2689.11, 794.11, 488.91, 189, 544.14, 0, 169.51, 23.53, 1053.61, 519.95, 3225.8, 2116.2, 1241.7, 414.7, 129.06, 1150.23, -20, 0, 0, 1423.65, 1079.36, 558]
      }

f2 = {'ACC_ID': [111, 222, 333, 444, 666, 777, 888, 999, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000],
      'FI_DATE': ['2019-09-26', '2019-09-23', '2019-09-19', '2019-09-30', '2019-09-14', '2019-09-25', '2019-09-24', '2019-09-24', '2019-09-11', '2019-09-24', '2019-09-30', '2019-09-17', '2019-09-21', '2019-09-22', '2019-09-09', '2019-09-03', '2019-09-05', '2019-09-10', '2019-09-29', '2019-09-25', '2019-09-22', '2019-09-01', '2019-09-28', '2019-09-22', '2019-09-23', '2019-09-08', '2019-09-22', '2019-09-11', '2019-09-09'],
      'BAL': [2128, -5, 532, -123, 2853, 864, -50, 2204, 2177, 325, 1246, 2650, 1860, 2838, -50, 2515, 1631, 1477, 321, -30, 1374, -19, 2865, 2331, 2349, 2051, 901, 1218, 1218]
      }

f3 = {'ACC_ID': [111, 222, 333, 444, 555, 666, 777, 888, 999, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000],
      'FI_DATE': ['2018-08-28', '2018-08-08', '2018-08-22', '2018-08-11', '2018-08-26', '2018-08-10', '2018-08-05', '2018-08-17', '2018-08-24', '2018-08-02', '2018-08-04', '2018-08-17', '2018-08-30', '2018-08-28', '2018-08-10', '2018-08-21', '2018-08-14', '2018-08-11', '2018-08-02', '2018-08-30', '2018-08-13', '2018-08-25', '2018-08-24', '2018-08-20', '2018-08-29', '2018-08-17', '2018-08-16', '2018-08-03', '2018-08-24'],
      'BAL': [1638, 574, 237, 1201, 1393, 1999, 81, 1984, 45, 1029, 832, 1117, 780, 227, 30, 777, 1680, -380, 1667, 1926, -386, 1244, 141, -221, 1049, 1653, 1299, 804, 1133]
      }

df1 = pd.DataFrame(f1)
df1['FI_DATE'] = pd.to_datetime(df1['FI_DATE'])

df2 = pd.DataFrame(f2)
df2['FI_DATE'] = pd.to_datetime(df2['FI_DATE'])

df3 = pd.DataFrame(f3)
df3['FI_DATE'] = pd.to_datetime(df3['FI_DATE'])

chunkSize = 1000
delimiterTmp = ','
tmpDirName = 'C:/AJ/Projects/data/temp'
profileFields = ['BAL']

df1.to_csv(tmpDirName+'/d1.csv', index=True, index_label='_index_')
df2.to_csv(tmpDirName+'/d2.csv', index=True, index_label='_index_')
df3.to_csv(tmpDirName+'/d3.csv', index=True, index_label='_index_')

There is some preprocessing code that creates a data.csv in the tmpDirName/tmp/ directory for each of these files, one at a time. Within the same code, I call the code below for each data.csv file.

inputFileName = tmpDirName + '/tmp/data.csv'
outputFileName = tmpDirName + '/tmp/prof.csv'
tmpProfileStore = tmpDirName + '/tmp/tmpProfileStore.csv'
profileStorePath = tmpDirName + '/tmp/profileStore.csv'


def updateProfile(args):
    # args is a per-row dict containing FI_DATE, profileJson and the profile fields.
    date1 = args['FI_DATE']
    profileJson = args['profileJson']

    if pd.isnull(date1):
        return None

    year = date1.year
    month = date1.month - 1  # 0-based index into the 12-slot monthly list

    # Start a fresh profile for new accounts, otherwise extend the stored one.
    if pd.isnull(profileJson):
        profile = {}
    else:
        profile = json.loads(profileJson)

    for fld in profileFields:
        if fld not in profile.keys():
            profile[fld] = {}

        # One 12-slot list per year, one slot per month.
        if str(year) not in profile[fld].keys():
            profile[fld][str(year)] = [None] * 12

        # Timestamps are not JSON-serializable, so store them as 'YYYY-MM-DD' strings.
        if type(args[fld]) == pd.Timestamp:
            profile[fld][str(year)][month] = args[fld].strftime('%Y-%m-%d')
        else:
            profile[fld][str(year)][month] = args[fld]

    return json.dumps(profile)


def main():
    # Remove stale outputs from a previous run.
    if os.path.isfile(outputFileName):
        os.remove(outputFileName)
    if os.path.isfile(tmpProfileStore):
        os.remove(tmpProfileStore)

    processedKeys = set()

    # Process the monthly input in chunks; for each chunk, pull the matching
    # accounts from the existing profile store and merge them in.
    reader = pd.read_csv(inputFileName, sep=delimiterTmp, encoding='utf-8', header=0, chunksize=chunkSize, parse_dates=['FI_DATE'])
    for dfProf in reader:
        if os.path.exists(profileStorePath):
            readerStore = pd.read_csv(profileStorePath, sep=delimiterTmp, encoding='utf-8', header=0, chunksize=chunkSize, usecols=['ACC_ID', 'profileJson'])
            chunks = []
            for df in readerStore:
                chunks.append(df[df.ACC_ID.isin(dfProf.ACC_ID)])
            dfStore = pd.concat(chunks)
            dfProf = pd.merge(dfProf, dfStore, on='ACC_ID', how='left')
        else:
            dfProf['profileJson'] = None

        # Build one dict per row and update that row's JSON profile.
        dfProf['_source_'] = dfProf[['FI_DATE', 'profileJson'] + profileFields].to_dict(orient='records')
        dfProf['profileJson'] = dfProf['_source_'].map(updateProfile)
        dfProf.drop(['_source_'], axis='columns', inplace=True)

        dfProf.to_csv(outputFileName, sep=delimiterTmp, encoding='utf-8', mode='a', header=not os.path.isfile(outputFileName))

        df = dfProf[['ACC_ID', 'profileJson']]
        df.to_csv(tmpProfileStore, sep=delimiterTmp, encoding='utf-8', mode='a', header=not os.path.isfile(tmpProfileStore), index=False)
        processedKeys.update(dfProf.ACC_ID)

    # Carry over accounts that did not appear in this month's input.
    if os.path.exists(profileStorePath):
        readerStore = pd.read_csv(profileStorePath, sep=delimiterTmp, encoding='utf-8', header=0, chunksize=chunkSize, usecols=['ACC_ID', 'profileJson'])
        for df in readerStore:
            df[~df.ACC_ID.isin(processedKeys)].to_csv(tmpProfileStore, sep=delimiterTmp, encoding='utf-8', mode='a', header=not os.path.isfile(tmpProfileStore), index=False)

    # Swap the rebuilt store into place.
    if os.path.exists(profileStorePath):
        os.remove(profileStorePath)
    os.rename(tmpProfileStore, profileStorePath)


if __name__ == '__main__':
    main()

Manual steps to reproduce the output (a scripted version of these steps is sketched right after the list):

  1. Copy d1.csv into the temp/tmp folder and rename it data.csv.
  2. Run the main function.
  3. Copy d2.csv into the temp/tmp folder and rename it data.csv.
  4. Run the main function.
  5. Copy d3.csv into the temp/tmp folder and rename it data.csv.
  6. Run the main function.
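
Instead of copying by hand, the three runs can also be driven by a small script; this is just a sketch that assumes main() and tmpDirName from the code above, and that d1.csv, d2.csv and d3.csv already exist in tmpDirName:

import shutil

# Hypothetical driver for the manual steps: copy each monthly extract to
# temp/tmp/data.csv and run main() once per file.
for name in ['d1.csv', 'd2.csv', 'd3.csv']:
    shutil.copyfile(tmpDirName + '/' + name, tmpDirName + '/tmp/data.csv')
    main()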

Running these steps produces a profileStore.csv file with the following contents:

ACC_ID,profileJson
111,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2128, 80.71, null, null], ""2018"": [null, null, null, null, null, null, null, 1638, null, null, null, null]}}"
222,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, -5, -260.71, null, null], ""2018"": [null, null, null, null, null, null, null, 574, null, null, null, null]}}"
333,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 532, -178.19, null, null], ""2018"": [null, null, null, null, null, null, null, 237, null, null, null, null]}}"
444,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, -123, 1914.93, null, null], ""2018"": [null, null, null, null, null, null, null, 1201, null, null, null, null]}}"
555,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, null, 7795.57, null, null], ""2018"": [null, null, null, null, null, null, null, 1393, null, null, null, null]}}"
666,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2853, 1224.39, null, null], ""2018"": [null, null, null, null, null, null, null, 1999, null, null, null, null]}}"
777,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 864, 1414.28, null, null], ""2018"": [null, null, null, null, null, null, null, 81, null, null, null, null]}}"
888,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, -50, 4185.31, null, null], ""2018"": [null, null, null, null, null, null, null, 1984, null, null, null, null]}}"
999,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2204, 2689.11, null, null], ""2018"": [null, null, null, null, null, null, null, 45, null, null, null, null]}}"
1000,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2177, 794.11, null, null], ""2018"": [null, null, null, null, null, null, null, 1029, null, null, null, null]}}"
1100,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 325, 488.91, null, null], ""2018"": [null, null, null, null, null, null, null, 832, null, null, null, null]}}"
1200,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 1246, 189.0, null, null], ""2018"": [null, null, null, null, null, null, null, 1117, null, null, null, null]}}"
1300,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2650, 544.14, null, null], ""2018"": [null, null, null, null, null, null, null, 780, null, null, null, null]}}"
1400,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 1860, 0.0, null, null], ""2018"": [null, null, null, null, null, null, null, 227, null, null, null, null]}}"
1500,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2838, 169.51, null, null], ""2018"": [null, null, null, null, null, null, null, 30, null, null, null, null]}}"
1600,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, -50, 23.53, null, null], ""2018"": [null, null, null, null, null, null, null, 777, null, null, null, null]}}"
1700,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2515, 1053.61, null, null], ""2018"": [null, null, null, null, null, null, null, 1680, null, null, null, null]}}"
1800,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 1631, 519.95, null, null], ""2018"": [null, null, null, null, null, null, null, -380, null, null, null, null]}}"
1900,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 1477, 3225.8, null, null], ""2018"": [null, null, null, null, null, null, null, 1667, null, null, null, null]}}"
2000,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 321, 2116.2, null, null], ""2018"": [null, null, null, null, null, null, null, 1926, null, null, null, null]}}"
2200,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 1374, 414.7, null, null], ""2018"": [null, null, null, null, null, null, null, -386, null, null, null, null]}}"
2300,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, -19, 129.06, null, null], ""2018"": [null, null, null, null, null, null, null, 1244, null, null, null, null]}}"
2400,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2865, 1150.23, null, null], ""2018"": [null, null, null, null, null, null, null, 141, null, null, null, null]}}"
2500,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2331, -20.0, null, null], ""2018"": [null, null, null, null, null, null, null, -221, null, null, null, null]}}"
2600,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2349, 0.0, null, null], ""2018"": [null, null, null, null, null, null, null, 1049, null, null, null, null]}}"
2700,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 2051, 0.0, null, null], ""2018"": [null, null, null, null, null, null, null, 1653, null, null, null, null]}}"
2800,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 901, 1423.65, null, null], ""2018"": [null, null, null, null, null, null, null, 1299, null, null, null, null]}}"
2900,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 1218, 1079.36, null, null], ""2018"": [null, null, null, null, null, null, null, 804, null, null, null, null]}}"
3000,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, 1218, 558.0, null, null], ""2018"": [null, null, null, null, null, null, null, 1133, null, null, null, null]}}"
2100,"{""BAL"": {""2019"": [null, null, null, null, null, null, null, null, -30, 1241.7, null, null]}}"

Everything works correctly, but it runs slowly. profileFields has about 12 columns, and each month's data can run up to 2 million rows.

Generating profileStore.csv takes about two hours. I would like to cut that time as much as possible without changing the structure of the output file.

Any suggestions are welcome.

0 Answers:

No answers