如何遍历 pandas 数据框以生成嵌套的 JSON?

时间:2020-09-06 17:02:57

标签: python-3.x pandas

我有一个具有以下结构的 pandas 数据框,可以使用以下代码创建:

import pandas as pd
import numpy as np

# Sample data: one row per token. Each level_N_start / level_N_end list holds
# the opening / closing tag attached to that token, or NaN when there is none.
# np.nan is used instead of np.NaN because the np.NaN alias was removed in
# NumPy 2.0. All lists have exactly 15 entries (the original `word` list had a
# 16th entry that zip() silently discarded).
word = ['this','is','a','test','call','this','is','a','test','call','this','is','a','test','call']
level_3_start = [np.nan,np.nan,'<tyre>','<steering>',np.nan,np.nan,np.nan,np.nan,'<leg>',np.nan,'<clutch>',np.nan,np.nan,'<break>',np.nan]
level_3_end = [np.nan,np.nan,'</tyre>',np.nan,'</steering>',np.nan,np.nan,np.nan,'</leg>',np.nan,np.nan,np.nan,'</clutch>','</break>',np.nan]
level_2_start = [np.nan,np.nan,'<car>',np.nan,np.nan,np.nan,np.nan,np.nan,'<dog>',np.nan,'<car>',np.nan,np.nan,'<bus>',np.nan]
level_2_end = [np.nan,np.nan,np.nan,np.nan,'</car>',np.nan,np.nan,np.nan,'</dog>',np.nan,np.nan,np.nan,'</car>','</bus>',np.nan]
level_1_start = [np.nan,np.nan,'<vehicle>',np.nan,np.nan,np.nan,np.nan,np.nan,'<animal>',np.nan,'<vehicle>',np.nan,np.nan,np.nan,np.nan]
level_1_end = [np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,'</vehicle>',np.nan,'</animal>',np.nan,np.nan,np.nan,np.nan,'</vehicle>',np.nan]

# Build the frame from a dict of columns: unlike list(zip(...)), which silently
# truncates to the shortest list, this raises ValueError if the lengths differ.
df1 = pd.DataFrame({
    'word': word,
    'level_3_start': level_3_start,
    'level_3_end': level_3_end,
    'level_2_start': level_2_start,
    'level_2_end': level_2_end,
    'level_1_start': level_1_start,
    'level_1_end': level_1_end,
})

enter image description here

我想遍历该数据框并生成 JSON。输出应如下所示:

{
  "vehicle": {
    "car": {
      "tyre": True,
      "steering": True,
      "clutch": True
    },
    "bus": {
      "break": True
    }
  },
  "animal": {
    "dog": {
      "leg": True
    }
  }
}

在 pandas 中实现这一目标的最佳方法是什么?

2 个答案:

答案 0 :(得分:0)

您记录的信息比实际需要的多:end 列是不需要的。

  1. 使用 dropna() 删除不含任何标签的行
  2. 向前填充(forward fill)标签,并去掉字符串中的 < 和 >
  3. 使用字典推导式,基于数据框 to_dict() 的结果构建字典
# Sample data: one row per token; empty strings mean "no tag on this row".
df = pd.DataFrame({"word":["this","is","a","test","call","this","is","a","test","call","this","is","a","test","call"],
              "level_3_start":["","","<tyre>","<steering>","","","","","<leg>","","<clutch>","","","<break>",""],
              "level_3_end":["","","</tyre>","","</steering>","","","","</leg>","","","","</clutch>","</break>",""],
              "level_2_start":["","","<car>","","","","","","<dog>","","<car>","","","<bus>",""],
              "level_2_end":["","","","","</car>","","","","</dog>","","","","</car>","</bus>",""],
              "level_1_start":["","","<vehicle>","","","","","","<animal>","","<vehicle>","","","",""],
              "level_1_end":["","","","","","","</vehicle>","","</animal>","","","","","</vehicle>",""]})

# cleanup
tag_cols = [c for c in df.columns if c != "word"]
start_cols = [c for c in df.columns if "start" in c]

# Turn empty strings into NaN and drop the rows that carry no tag at all.
df = df.replace({"": np.nan}).dropna(subset=tag_cols, how="all")

# Forward-fill each start column so every row knows its enclosing tags, then
# strip the angle brackets. The columns are assigned back explicitly:
# `df[c].fillna(method="ffill", inplace=True)` is a chained in-place call that
# does not update `df` under pandas Copy-on-Write, and `method=` is deprecated.
df = df.assign(**{c: df[c].ffill().str.strip("<>") for c in start_cols})

# Rows that appear before the first tag of some level cannot be forward-filled;
# drop them so NaN never ends up as a dictionary key.
df = df.dropna(subset=start_cols)

# Only the start columns define the hierarchy, so de-duplicate on those alone.
dfd = df.loc[:, start_cols].drop_duplicates().to_dict(orient="records")

# level_1 -> level_2 -> level_3 -> True. Repeated keys simply overwrite
# themselves with an identical value, so first-seen key order is kept.
nested = {
    d["level_1_start"]: {
        d2["level_2_start"]: {
            d3["level_3_start"]: True
            for d3 in dfd
            if d3["level_1_start"] == d["level_1_start"] and d3["level_2_start"] == d2["level_2_start"]
        }
        for d2 in dfd
        if d2["level_1_start"] == d["level_1_start"]
    }
    for d in dfd
}
print(nested)

输出

{'vehicle': {'car': {'tyre': True, 'steering': True, 'clutch': True},
  'bus': {'break': True}},
 'animal': {'dog': {'leg': True}}}

答案 1 :(得分:0)

要获得最终结果,您的数据必须经过3个步骤:

第1步:删除所有不需要处理的列

第2步:清除数据以删除标签,并按照level_1,level_2,level_3的顺序对其进行排序

第3步:创建嵌套字典

以下是我的做法。每个部分都加了注释,以便清楚地说明所做的操作。

import pandas as pd
import numpy as np
import collections

# Sample data: one row per token. Each level_N_start / level_N_end list holds
# the opening / closing tag attached to that token, or NaN when there is none.
# np.nan replaces np.NaN (alias removed in NumPy 2.0), and every list has
# exactly 15 entries (the original `word` list had a 16th that zip() dropped).
word = ['this','is','a','test','call','this','is','a','test','call','this','is','a','test','call']
level_3_start = [np.nan,np.nan,'<tyre>','<steering>',np.nan,np.nan,np.nan,np.nan,'<leg>',np.nan,'<clutch>',np.nan,np.nan,'<break>',np.nan]
level_3_end = [np.nan,np.nan,'</tyre>',np.nan,'</steering>',np.nan,np.nan,np.nan,'</leg>',np.nan,np.nan,np.nan,'</clutch>','</break>',np.nan]
level_2_start = [np.nan,np.nan,'<car>',np.nan,np.nan,np.nan,np.nan,np.nan,'<dog>',np.nan,'<car>',np.nan,np.nan,'<bus>',np.nan]
level_2_end = [np.nan,np.nan,np.nan,np.nan,'</car>',np.nan,np.nan,np.nan,'</dog>',np.nan,np.nan,np.nan,'</car>','</bus>',np.nan]
level_1_start = [np.nan,np.nan,'<vehicle>',np.nan,np.nan,np.nan,np.nan,np.nan,'<animal>',np.nan,'<vehicle>',np.nan,np.nan,np.nan,np.nan]
level_1_end = [np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,'</vehicle>',np.nan,'</animal>',np.nan,np.nan,np.nan,np.nan,'</vehicle>',np.nan]

# A dict of columns raises ValueError on a length mismatch instead of
# silently truncating the way list(zip(...)) does.
df1 = pd.DataFrame({
    'word': word,
    'level_3_start': level_3_start,
    'level_3_end': level_3_end,
    'level_2_start': level_2_start,
    'level_2_end': level_2_end,
    'level_1_start': level_1_start,
    'level_1_end': level_1_end,
})

# Step 1: keep only the columns that matter for this problem (the start tags).
# .copy() makes df_temp independent of df1, so df1 is never modified.
start_cols = ['level_1_start', 'level_2_start', 'level_3_start']
df_temp = df1[start_cols].copy()

# Step 2: clean the data.
# remove all < and >
for col in start_cols:
    df_temp[col] = df_temp[col].str.strip('<>')

# drop all rows that don't have any value
df_temp = df_temp.dropna(how='all')

# forward-fill level_1 so every remaining row knows its top-level parent
df_temp = df_temp.assign(level_1_start=df_temp['level_1_start'].ffill())

# drop rows that have no data in level_2 and level_3
df_temp = df_temp.dropna(subset=['level_2_start', 'level_3_start'], how='all')

# forward-fill level_2 so every remaining row knows its level_2 parent
df_temp = df_temp.assign(level_2_start=df_temp['level_2_start'].ffill())

# drop rows that have no data in level_3
df_temp = df_temp.dropna(subset=['level_3_start'])

# sort in level_1, level_2, level_3 order; this fixes the key order of the
# resulting dictionary
df_temp = df_temp.sort_values(by=start_cols)

# Step 3: build the nested dictionary.
# A two-level defaultdict creates missing level_1 / level_2 entries on first
# access, so no explicit "does this key exist yet?" branching is needed.
df_dict = collections.defaultdict(lambda: collections.defaultdict(dict))

# each row holds one (level_1, level_2, level_3) path
for lev_1, lev_2, lev_3 in df_temp[start_cols].itertuples(index=False, name=None):
    df_dict[lev_1][lev_2][lev_3] = True

# convert the defaultdicts (outer and inner) back to plain dictionaries
df_dict = {lev_1: dict(children) for lev_1, children in df_dict.items()}

print(df_dict)

输出如下:

{'animal': 
    {'dog': {'leg': True}
    }, 
 'vehicle': 
    {'bus': {'break': True}, 
     'car': {'clutch': True, 'steering': True, 'tyre': True}
    }
}