我有3个数据框,我已将它们合并为一个,并希望将数据框表示为嵌套字典/ json格式。
df1:其中包含有关患者的一般信息。
>>> df1 = pd.DataFrame({'PatientId' : [1,2], 'Gender' : ['M', 'F'], 'Marital_status':['married', 'unmarried']})
>>> df1
PatientId Gender Marital_status
1 M married
2 F unmarried
df2: 其中包含每次患者入院和诊断的详细信息。
>>> df2 = pd.DataFrame({'PatientId': [1,1,2,2], 'AdmissionId' : [1,2,1,2], 'Diagnosis_Code': ['DXS', 'SDE', 'DEF', 'ATR'], 'Stay_Duration' : [45,14,79,32]})
>>> df2
PatientId AdmissionId Diagnosis_Code Stay_Duration
1 1 DXS 45
1 2 SDE 14
2 1 DEF 79
2 2 ATR 32
df3: 该数据框包含每次入院患者的所有实验室睾丸报告。
>>> df3 = pd.DataFrame(
{
'PatientId':[1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2],
'AdmissionId' : [1,1,1,1,2,2,2,2,1,1,1,1,2,2,2,2],
'LabTest' : [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2],
'LabName' : ['ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'PQR'],
'LabValue' : [5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4],
'IsNormal' : [True, False, True, True, True, False, True, True, True, False, True, True, True, False, True, True]
}
)
>>> df3
PatientId AdmissionId LabTest LabName LabValue IsNormal
1 1 1 ABC 5.7 True
1 1 1 XYZ 1.9 False
1 1 2 ABC 5.6 True
1 1 2 XYZ 2.4 True
1 2 1 PQR 5.7 True
1 2 1 XYZ 1.9 False
1 2 2 ABC 5.6 True
1 2 2 XYZ 2.4 True
2 1 1 ABC 5.7 True
2 1 1 XYZ 1.9 False
2 1 2 ABC 5.6 True
2 1 2 XYZ 2.4 True
2 2 1 PQR 5.7 True
2 2 1 XYZ 1.9 False
2 2 2 ABC 5.6 True
2 2 2 PQR 2.4 True
我希望我的输出看起来像这样-
"PatientId" : 1
"Gender":M
"Marital_Status" : married
"AdmissionsInfo":
{
"AdmissionID": 1
"Diagnosis": DXS
"Stay_Duration" : 45
"lab reports" :
{
"labtest":1
"labinfo":
{
"labName":ABC
"labValue":5.6
"isNormal":True
},
{
"labName": XYZ
"labValue": 2.4
"isNormal": True
}
"labtest":2
"labinfo":
{
"labName":ABC
"labValue":5.7
"isNormal":True
},
{
"labName": XYZ
"labValue": 1.9
"isNormal":False
}
}
"AdmissionID": 2
"Diagnosis": SDE
"Stay_Duration" : 45
/
/
//
} end of patient 1's all admissions' info
"PatientId" : 2
"Gender": F
"Marital_Status" : unmarried
"AdmissionsInfo":
//
//
and so on }}}
答案 0 :(得分:0)
在下面找到完整的熊猫(尽管有冗余)解决方案。
首先将您的三个数据帧合并为一个被调用的df_merged
:
df_merged = df3.merge(df1, on="PatientId").merge(df2, on=["PatientId", "AdmissionId"])
现在创建您需要的层次结构(这部分很难看,但是很高兴收到有关它的反馈):
(df_merged.groupby(["PatientId", "Gender", "Marital_status", "AdmissionId", "Diagnosis_Code", "Stay_Duration", "LabTest"])
.apply(lambda x: x[["LabName", "LabValue", "IsNormal"]].to_dict("r"))
.reset_index()
.rename(columns={0:"LabInfo"})
.groupby(["PatientId", "Gender", "Marital_status", "AdmissionId", "Diagnosis_Code", "Stay_Duration"])
.apply(lambda x: x[["LabTest", "LabInfo"]].to_dict("r"))
.reset_index()
.rename(columns={0:"LabReports"})
.groupby(["PatientId", "Gender", "Marital_status"])
.apply(lambda x: x[["AdmissionId", "Diagnosis_Code", "Stay_Duration", "LabReports"]].to_dict("r"))
.reset_index()
.rename(columns={0:"AdmissionsInfo"})
.to_json(orient="records"))
并将其转储到json
对象中:
>>> import json
>>> print(json.dumps(json.loads(j), indent=2, sort_keys=False))
结果:
[
{
"PatientId": 1,
"Gender": "M",
"Marital_status": "married",
"AdmissionsInfo": [
{
"AdmissionId": "1",
"Diagnosis_Code": "DXS",
"Stay_Duration": "45",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
},
{
"AdmissionId": "2",
"Diagnosis_Code": "SDE",
"Stay_Duration": "14",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "PQR",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
}
]
},
{
"PatientId": 2,
"Gender": "F",
"Marital_status": "unmarried",
"AdmissionsInfo": [
{
"AdmissionId": "1",
"Diagnosis_Code": "DEF",
"Stay_Duration": "79",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
},
{
"AdmissionId": "2",
"Diagnosis_Code": "ATR",
"Stay_Duration": "32",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "PQR",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "PQR",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
}
]
}
]
答案 1 :(得分:0)
In [84]: json_list = []
In [85]: for index1, row1 in df1.iterrows():
...: d = dict(row1)
...: json_list.append(d)
...: for index2, row2 in df2[df2['PatientId'] == row1['PatientId']].iterrows():
...: d2 = dict(row2)
...: del d2['PatientId']
...: d['AdmissionsInfo'] = d2
...: lab_reports_list = []
...: for index3, row3 in df3[(df3['PatientId'] == row2['PatientId']) & (df3['AdmissionId'] == row2['AdmissionId'])].iterrows():
...: d3 = dict(row3)
...: d4 = {}
...: d4['labtest'] = row3['LabTest']
...: d4['labinfo'] = {'labName': row3['LabName'], 'labValue': row3['LabValue'], 'isNormal': row3['IsNormal']}
...: lab_reports_list.append(d4)
...: d2['lab reports'] = lab_reports_list
输出
In [87]: json_list
Out[87]:
[{'PatientId': 1,
'Gender': 'M',
'Marital_status': 'married',
'AdmissionsInfo': {'AdmissionId': 2,
'Diagnosis_Code': 'SDE',
'Stay_Duration': 14,
'lab reports': [{'labtest': 1,
'labinfo': {'labName': 'PQR', 'labValue': 5.7, 'isNormal': True}},
{'labtest': 1,
'labinfo': {'labName': 'XYZ', 'labValue': 1.9, 'isNormal': False}},
{'labtest': 2,
'labinfo': {'labName': 'ABC', 'labValue': 5.6, 'isNormal': True}},
{'labtest': 2,
'labinfo': {'labName': 'XYZ', 'labValue': 2.4, 'isNormal': True}}]}},
{'PatientId': 2,
'Gender': 'F',
'Marital_status': 'unmarried',
'AdmissionsInfo': {'AdmissionId': 2,
'Diagnosis_Code': 'ATR',
'Stay_Duration': 32,
'lab reports': [{'labtest': 1,
'labinfo': {'labName': 'PQR', 'labValue': 5.7, 'isNormal': True}},
{'labtest': 1,
'labinfo': {'labName': 'XYZ', 'labValue': 1.9, 'isNormal': False}},
{'labtest': 2,
'labinfo': {'labName': 'ABC', 'labValue': 5.6, 'isNormal': True}},
{'labtest': 2,
'labinfo': {'labName': 'PQR', 'labValue': 2.4, 'isNormal': True}}]}}]
答案 2 :(得分:-1)
Arnaud的反应可以完成任务,但看起来不是很“ Pythonic”。
DataFrameGroupby上的聚合函数可能有可能。尝试使用.agg(dict),但不起作用。
对于那些想要帮助的人:
patient_df = pd.DataFrame({'PatientId' : [1,2], 'Gender' : ['M', 'F'], 'Marital_status':['married', 'unmarried']})
admission_df = pd.DataFrame({'PatientId': [1,1,2,2], 'AdmissionId' : [1,2,1,2], 'Diagnosis_Code': ['DXS', 'SDE', 'DEF', 'ATR'], 'Stay_Duration' : [45,14,79,32]})
lab_df = pd.DataFrame(
{
'PatientId':[1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2],
'AdmissionId' : [1,1,1,1,2,2,2,2,1,1,1,1,2,2,2,2],
'LabTest' : [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2],
'LabName' : ['ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'PQR'],
'LabValue' : [5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4],
'IsNormal' : [True, False, True, True, True, False, True, True, True, False, True, True, True, False, True, True]
}
)