Question

我一直致力于解析JSON文件，以便更容易在PostgreSQL中使用，并且想知道将JSON字典解析为元组的最佳方法是什么？

例如，两个变量的行看起来像这样：

第一个

"attributes": {"RestaurantsTableService": false, "GoodForMeal": {"dessert": false, "latenight": false, "lunch": false, "dinner": false, "breakfast": false, "brunch": false}, "Alcohol": "none", "Caters": true, "HasTV": false, "RestaurantsGoodForGroups": true, "NoiseLevel": "quiet", "WiFi": "no", "RestaurantsAttire": "casual", "RestaurantsReservations": false, "OutdoorSeating": false, "BusinessAcceptsCreditCards": true, "RestaurantsPriceRange2": 1, "BikeParking": true, "RestaurantsDelivery": false, "Ambience": {"romantic": false, "intimate": false, "classy": false, "hipster": false, "divey": false, "touristy": false, "trendy": false, "upscale": false, "casual": false}, "RestaurantsTakeOut": true, "GoodForKids": true, "BusinessParking": {"garage": false, "street": false, "validated": false, "lot": false, "valet": false}}

第二个：

"hours": {"Monday": "7:30-22:00", "Tuesday": "7:30-22:00", "Friday": "7:30-22:00", "Wednesday": "7:30-22:00", "Thursday": "7:30-22:00", "Sunday": "7:30-21:00", "Saturday": "7:30-22:00"}

我希望它们采用如下格式：

对于属性：

Attributes: [(RestaurantsTableService, False)(dessert, False)(latenight, False)(lunch, False)(dinner, False)(breakfast, False)(brunch, False)(Alcohol, none)(Caters, True)(HasTV, False)(RestaurantsGoodForGroups, True)(NoiseLevel, quiet)(WiFi, no)(RestaurantsAttire, casual)(RestaurantsReservations, False)(OutdoorSeating, False)(BusinessAcceptsCreditCards, True)(RestaurantsPriceRange2, 1)(BikeParking, True)(RestaurantsDelivery, False)(romantic, False)(intimate, False)(classy, False)(hipster, False)(divey, False)(touristy, False)(trendy, False)(upscale, False)(casual, False)(RestaurantsTakeOut, True)(GoodForKids, True)(garage, False)(street, False)(validated, False)(lot, False)(valet, False)]

对于营业时间：

Hours: [(Friday, 9:00,12:00)(Tuesday, 14:00,19:00)(Thursday, 14:00,19:00)(Wednesday, 14:30,17:00)(Monday, 14:30,17:00)]

下面是我目前的代码。我一直在尝试访问这些变量中每个字典的值：我能够遍历字典的键，却无法取到对应的布尔值、整数或字符串值。

import json
import ast
import pandas as pd
from datetime import datetime
from collections import OrderedDict, defaultdict

def cleanStr4SQL(s):
    """Return *s* with single quotes turned into backticks and newlines into spaces.

    NOTE(review): this only rewrites two characters; it is presumably meant
    to keep text from breaking a hand-built SQL statement -- confirm against
    the caller.
    """
    without_quotes = s.replace("'", "`")
    single_line = without_quotes.replace("\n", " ")
    return single_line


def parseBusinessData():
    """Convert ``yelp_business.JSON`` into the tab-separated ``business.txt``.

    The input file is read from the current working directory and is parsed
    as one JSON object per line.  For every object, the key names of its
    ``attributes`` and ``hours`` dictionaries are written as two
    tab-terminated columns on a single output line.  Blank lines in the
    input are skipped.  The number of objects processed is printed at the end.

    Raises:
        OSError: if either file cannot be opened.
        ValueError: (``json.JSONDecodeError``) if a non-blank line is not
            valid JSON.
        KeyError: if an object has no ``attributes`` or ``hours`` entry.
    """
    count_line = 0
    # Assumes the data files are available in the current directory. If not,
    # set the path for the yelp data files here.
    # Both handles live in one ``with`` so they are closed even when a
    # malformed line raises part-way through the file.
    with open('yelp_business.JSON', 'r') as f, open('business.txt', 'w') as outfile:
        # read each JSON object and extract data
        for line in f:
            if not line.strip():
                # A blank line is not a JSON document; skip it instead of
                # letting json.loads raise.
                continue
            data = json.loads(line)

            # Iterating a dict yields its keys, so only the key names (not
            # the values) end up in the output.
            outfile.write(str(list(data['attributes'])) + '\t')  # write your own code to process attributes
            outfile.write(str(list(data['hours'])) + '\t')  # write your own code to process hours
            outfile.write('\n')

            count_line += 1
    print(count_line)

def attributes(val):
    """Map a boolean-like value to an integer flag.

    Returns 0 when *val* compares equal to ``False``, 1 when it compares
    equal to ``True``, and ``None`` for anything else.  Equality (not
    identity) is used, so the integers 0 and 1 are mapped as well.
    """
    for flag, code in ((False, 0), (True, 1)):
        if val == flag:
            return code
    return None

如果您有任何其他问题或疑虑，请告诉我。非常感谢您的任何建议。

感谢您的阅读。

Answer 1

如果我没有理解错的话，你是想生成一个特定格式的字符串，并把嵌套字典中的属性扁平化。

这是你想要的吗？

import json

def flatten(D, key):
    """Flatten the dictionary stored at ``D[key]`` into a list of pairs.

    Every non-dict value contributes one ``(name, value)`` tuple.  A value
    that is itself a dict is replaced by the pairs of its own entries, at any
    nesting depth, so the result never contains a dict as a value.  The name
    of a nested dict is dropped; only its leaf names are kept.  Pairs appear
    in the dictionaries' iteration order.

    Args:
        D: mapping that holds the dictionary to flatten.
        key: key under which that dictionary is stored in ``D``.

    Returns:
        A list of ``(name, value)`` tuples.

    Raises:
        KeyError: if ``key`` is not present in ``D``.
    """
    def _leaf_pairs(mapping):
        # Recurse so that dicts nested more than one level deep are also
        # unwrapped instead of being emitted as raw dict values.
        for k, v in mapping.items():
            if isinstance(v, dict):
                yield from _leaf_pairs(v)
            else:
                yield (k, v)

    return list(_leaf_pairs(D[key]))

# Sample "attributes" payload: flat entries plus three nested dictionaries.
att_json = '{"attributes": {"RestaurantsTableService": false, "GoodForMeal": {"dessert": false, "latenight": false, "lunch": false, "dinner": false, "breakfast": false, "brunch": false}, "Alcohol": "none", "Caters": true, "HasTV": false, "RestaurantsGoodForGroups": true, "NoiseLevel": "quiet", "WiFi": "no", "RestaurantsAttire": "casual", "RestaurantsReservations": false, "OutdoorSeating": false, "BusinessAcceptsCreditCards": true, "RestaurantsPriceRange2": 1, "BikeParking": true, "RestaurantsDelivery": false, "Ambience": {"romantic": false, "intimate": false, "classy": false, "hipster": false, "divey": false, "touristy": false, "trendy": false, "upscale": false, "casual": false}, "RestaurantsTakeOut": true, "GoodForKids": true, "BusinessParking": {"garage": false, "street": false, "validated": false, "lot": false, "valet": false}}}'
att = json.loads(att_json)
att_list = flatten(att, 'attributes')
# Render every pair as "(name, value)" and wrap the run in "Attributes: [...]".
s = 'Attributes: [{}]'.format(
    ''.join('({}, {})'.format(name, value) for name, value in att_list)
)
print(s)

# Sample "hours" payload: one "open-close" string per weekday.
hours_json = '{"hours": {"Monday": "7:30-22:00", "Tuesday": "7:30-22:00", "Friday": "7:30-22:00", "Wednesday": "7:30-22:00", "Thursday": "7:30-22:00", "Sunday": "7:30-21:00", "Saturday": "7:30-22:00"}}'
hours = json.loads(hours_json)
hours_list = flatten(hours, 'hours')
# Every "-" in a time range becomes "," (e.g. "7:30-22:00" -> "7:30,22:00").
s = 'Hours: [{}]'.format(
    ''.join(
        '({}, {})'.format(day, span.replace('-', ','))
        for day, span in hours_list
    )
)
print(s)

输出：

Attributes: [(RestaurantsTableService, False)(dessert, False)(latenight, False)(lunch, False)(dinner, False)(breakfast, False)(brunch, False)(Alcohol, none)(Caters, True)(HasTV, False)(RestaurantsGoodForGroups, True)(NoiseLevel, quiet)(WiFi, no)(RestaurantsAttire, casual)(RestaurantsReservations, False)(OutdoorSeating, False)(BusinessAcceptsCreditCards, True)(RestaurantsPriceRange2, 1)(BikeParking, True)(RestaurantsDelivery, False)(romantic, False)(intimate, False)(classy, False)(hipster, False)(divey, False)(touristy, False)(trendy, False)(upscale, False)(casual, False)(RestaurantsTakeOut, True)(GoodForKids, True)(garage, False)(street, False)(validated, False)(lot, False)(valet, False)]
Hours: [(Monday, 7:30,22:00)(Tuesday, 7:30,22:00)(Friday, 7:30,22:00)(Wednesday, 7:30,22:00)(Thursday, 7:30,22:00)(Sunday, 7:30,21:00)(Saturday, 7:30,22:00)]

从JSON文件创建元组

1 个答案: