我有一个(格式不正确的)JSON文件,我想将其转换为CSV表格。下面展示了该JSON文件中的两条记录(共2500条中的2条):
{
"usage":{
"text_characters":7653,
"features":2,
"text_units":1
},
"emotion":{
"document":{
"emotion":{
"anger":0.085554,
"joy":0.526103,
"sadness":0.533085,
"fear":0.148549,
"disgust":0.078346
}
}
},
"language":"en",
"sentiment":{
"document":{
"score":-0.323271,
"label":"negative"
}
},
"retrieved_url":"http://blogs.plos.org/speakingofmedicine/2017/01/20/the-why-vaccines-dont-cause-autism-papers/"
}{
"usage":{
"text_characters":5528,
"features":2,
"text_units":1
},
"emotion":{
"document":{
"emotion":{
"anger":0.160801,
"joy":0.443317,
"sadness":0.596578,
"fear":0.555745,
"disgust":0.127581
}
}
},
"language":"en",
"sentiment":{
"document":{
"score":-0.558026,
"label":"negative"
}
},
"retrieved_url":"http://www.cnn.com/2011/HEALTH/01/05/autism.vaccines/index.html"
}
但是我想用Python将它转换为这样的CSV表:
usage__text_characters usage__features usage__text_units emotion__document__emotion__anger emotion__document__emotion__joy emotion__document__emotion__sadness emotion__document__emotion__fear emotion__document__emotion__disgust language sentiment__document__score sentiment__document__label retrieved_url
7653 2 1 0.085554 0.526103 0.533085 0.148549 0.078346 en -0.323271 negative http://blogs.plos.org/speakingofmedicine/2017/01/20/the-why-vaccines-dont-cause-autism-papers/
5528 2 1 0.160801 0.443317 0.596578 0.555745 0.127581 en -0.558026 negative http://www.cnn.com/2011/HEALTH/01/05/autism.vaccines/index.html
我已经尝试了几种方法,但都没有成功(下面把我尝试过的代码合并在了一起):
import json
import pandas as pd
# Attempt 1: parse the whole file as a single JSON document and print it.
# NOTE(review): the sample above shows objects concatenated as "}{", which is
# not one valid JSON document, so json.load presumably rejects it - confirm.
with open('data.json') as data_file:
dd = json.load(data_file)
# Python 2 print statement.
print dd
# Attempt 2: let pandas parse the file, then unstack and drop missing values.
df = pd.read_json('data.json').unstack().dropna()
# Attempt 3: read with lines=True, which pandas documents as "one JSON object
# per line".
# NOTE(review): the sample records span many lines, so this mode presumably
# does not match the file layout - confirm against the real file.
data = pd.read_json('data.json', lines=True)
# Attempt 4: read the raw lines, strip trailing whitespace from each, join
# them with commas and wrap the result in [...] to build a JSON array string.
# NOTE(review): that only yields valid JSON if every line is a complete
# object - confirm against the real file.
with open('data.json', 'rb') as f:
data = f.readlines()
data = map(lambda x: x.rstrip(), data)
data_json_str = "[" + ','.join(data) + "]"
data_df = pd.read_json(data_json_str)
@JeffMercado的答案解决了问题
答案 0 :(得分:0)
这远不是最优雅的解决方案,但您可以先把每条记录读入一个逐步更新的字典,然后再把这些字典放入数据框(DataFrame)中。
import pandas as pd
from collections import OrderedDict
import json


def iter_json_objects(text):
    """Yield every JSON value stored back to back in *text*.

    The input file is not one valid JSON document: it is a run of objects
    written one after another (``{...}{...}``), optionally separated by
    whitespace.  Each object is decoded in turn instead of patching the
    string, so the result does not depend on how the file is formatted.

    Args:
        text: The complete contents of the file.

    Yields:
        One decoded object per JSON value, with key order preserved.

    Raises:
        ValueError: If the text contains something that is not valid JSON.
    """
    decoder = json.JSONDecoder(object_pairs_hook=OrderedDict)
    pos, end = 0, len(text)
    while True:
        # raw_decode() does not skip leading whitespace on its own.
        while pos < end and text[pos].isspace():
            pos += 1
        if pos >= end:
            return
        obj, pos = decoder.raw_decode(text, pos)
        yield obj


def flatten(record, sep="__", prefix=""):
    """Return a one-level copy of a nested dict.

    Nested keys are joined with *sep*, e.g. ``{"usage": {"features": 2}}``
    becomes ``{"usage__features": 2}``.  Top-level scalars such as
    ``language`` are kept under their own name.

    Args:
        record: The (possibly nested) dict to flatten.
        sep: Separator placed between the key levels.
        prefix: Key path of the enclosing dict; empty at the top level.

    Returns:
        An ``OrderedDict`` whose keys follow the order found in *record*.
    """
    flat = OrderedDict()
    for key, value in record.items():
        name = prefix + sep + str(key) if prefix else str(key)
        if isinstance(value, dict):
            flat.update(flatten(value, sep, name))
        else:
            flat[name] = value
    return flat


def convert(src="data.json", dst="output.csv"):
    """Convert the concatenated-JSON file *src* into the CSV file *dst*.

    Args:
        src: Path of the file holding the concatenated JSON objects.
        dst: Path of the CSV file to write.

    Returns:
        The DataFrame that was written, one row per JSON object.
    """
    with open(src, "r") as f:
        rows = [flatten(obj) for obj in iter_json_objects(f.read())]
    # Column order: first appearance across all records, so a key missing
    # from the first record is still exported (as an empty cell elsewhere).
    columns = list(OrderedDict.fromkeys(key for row in rows for key in row))
    df = pd.DataFrame(rows, columns=columns)
    # output without index
    df.to_csv(dst, index=False)
    return df


if __name__ == "__main__":
    df = convert()
<table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>usage__text_characters</th> <th>usage__features</th> <th>usage__text_units</th> <th>emotion__document__emotion__anger</th> <th>emotion__document__emotion__joy</th> <th>emotion__document__emotion__sadness</th> <th>emotion__document__emotion__fear</th> <th>emotion__document__emotion__disgust</th> <th>sentiment__document__score</th> <th>sentiment__document__label</th> <th>retrieved_url</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>7653</td> <td>2</td> <td>1</td> <td>0.085554</td> <td>0.526103</td> <td>0.533085</td> <td>0.148549</td> <td>0.078346</td> <td>-0.323271</td> <td>negative</td> <td>http://blogs.plos.org/speakingofmedicine/2017/...</td> </tr> <tr> <th>1</th> <td>5528</td> <td>2</td> <td>1</td> <td>0.160801</td> <td>0.443317</td> <td>0.596578</td> <td>0.555745</td> <td>0.127581</td> <td>-0.558026</td> <td>negative</td> <td>http://www.cnn.com/2011/HEALTH/01/05/autism.va...</td> </tr> <tr> <th>2</th> <td>11640</td> <td>2</td> <td>2</td> <td>0.062221</td> <td>0.144592</td> <td>0.200812</td> <td>0.151575</td> <td>0.075855</td> <td>-0.628669</td> <td>negative</td> <td>https://en.wikipedia.org/wiki/MMR_vaccine</td> </tr> <tr> <th>3</th> <td>3079</td> <td>2</td> <td>1</td> <td>0.134388</td> <td>0.104364</td> <td>0.254788</td> <td>0.265767</td> <td>0.082326</td> <td>-0.413833</td> <td>negative</td> <td>https://immunize.ca/resources/89</td> </tr> <tr> <th>4</th> <td>3074</td> <td>2</td> <td>1</td> <td>0.125521</td> <td>0.521253</td> <td>0.242785</td> <td>0.559390</td> <td>0.062896</td> <td>-0.234100</td> <td>negative</td> <td>https://www.autismspeaks.org/what-autism/learn...</td> </tr> <tr> <th>5</th> <td>4295</td> <td>2</td> <td>1</td> <td>0.093873</td> <td>0.483575</td> <td>0.509759</td> <td>0.487799</td> <td>0.046805</td> <td>-0.328508</td> <td>negative</td> <td>https://www.cdc.gov/ncbddd/autism/topics.html</td> </tr> <tr> <th>6</th> <td>17654</td> <td>2</td> <td>2</td> 
<td>0.069062</td> <td>0.512447</td> <td>0.544514</td> <td>0.510827</td> <td>0.078509</td> <td>-0.517533</td> <td>negative</td> <td>https://www.healthychildren.org/English/safety...</td> </tr> </tbody></table>
答案 1 :(得分:0)
嗯,这有点晚了,但是我的同事决心使用递归函数制作解决方案,所以我会分享它。
import json, pandas
# Decode the JSON file; `emotions` ends up bound to the parsed data, which
# the code further down indexes and iterates record by record.
with open('emotions.json') as json_file:
    emotions = json.load(json_file)
def flattener(my_dict, return_dict=None, mykey=''):
    """Recursively flatten a nested dict into a single-level dict.

    Each output key is the path of keys leading to a value, with every
    level prefixed by ``'__'``.  Top-level keys therefore come out with a
    leading ``'__'`` (for example ``'__usage__features'``).

    Args:
        my_dict: The (possibly nested) dict to flatten.
        return_dict: Dict that receives the flattened items.  A new dict is
            created when omitted; a dict passed in is updated in place.
        mykey: Key path of the enclosing dict; empty at the top level.

    Returns:
        The dict holding the flattened items (``return_dict`` when given).
    """
    # A literal ``{}`` default is created once and shared by every call, so
    # keys from an earlier record would leak into later results.
    if return_dict is None:
        return_dict = {}
    for key, item in my_dict.items():
        flat_key = mykey + '__' + str(key)
        if isinstance(item, dict):
            # The nested call fills the same accumulator in place.
            flattener(item, return_dict, flat_key)
        else:
            return_dict[flat_key] = item
    return return_dict
# Flatten each record into its own row.  A fresh dict is passed explicitly so
# that no state is shared between records, and building one dict per row
# (instead of one list per column) lets records with differing keys coexist:
# pandas leaves the missing cells empty instead of the script failing.
rows = []
for emotion in emotions:
    flat = flattener(emotion, {})
    # flattener prefixes every key with '__'; drop that leading separator.
    rows.append({key[2:]: value for key, value in flat.items()})
# pandas make writing to csv easy
df = pandas.DataFrame(rows)
# index=False keeps the row numbers out of the file, matching the table
# requested in the question.
df.to_csv('csv_name.csv', index=False)