我一直在尝试根据一系列表格创建一个 MongoDB 数据库。
我已将表连接成这样的数据框:
subid,firstvisit,name,contact,dob,gender,visitdate1,age,visitcategory,samplenumber,label_on_sample,completed_by
1,12/31/11,Bob,,12/31/00,Male,,,,,,
1,,,,,,12/31/15,17,Baseline Visit,,,
1,,,,,,12/31/16,18,Follow Up Visit,,,
1,,,,,,12/31/17,18,Follow Up Visit,,,
1,,,,12/31/00,Male,,17,,XXX123,1,Sally
2,1/1/12,,,1/1/01,Female,,,,,,
2,,,,,,1/1/11,10,Baseline Visit,,,
2,,,,,,1/1/12,11,Follow Up Visit,,,
2,,,,,,1/1/13,12,Follow Up Visit,,,
2,,,,,,1/1/14,13,Follow Up Visit,,,
2,,,,,,1/1/15,14,Follow Up Visit,,,
2,,,,1/1/01,Female,,15,,YYY456,2,
2,,,,1/1/01,Female,,15,,ZZZ789,2,Sally
输出如下:
[
{
"subject_id": "1",
"name": "Bob",
"dob": "12/31/00",
"gender": "Male",
"visits": {
"12/31/15": {
"age": "17",
"visit_category": "Baseline Visit"
},
"12/31/16": {
"age": "18",
"visit_category": "Follow Up Visit"
},
"12/31/17": {
"age": "18",
"visit_category": "Follow Up Visit"
}
},
"samples": {
"XXX123": {
"completed_by": "Sally",
"label_on_sample": "1"
}
}
},
{
"subject_id": "2",
"name": null,
"dob": "1/1/01",
"gender": "Female",
"visits": {
"1/1/11": {
"age": "10",
"visit_category": "Baseline Visit"
},
"1/1/12": {
"age": "11",
"visit_category": "Follow Up Visit"
},
"1/1/13": {
"age": "12",
"visit_category": "Follow Up Visit"
},
"1/1/14": {
"age": "13",
"visit_category": "Follow Up Visit"
},
"1/1/15": {
"age": "14",
"visit_category": "Follow Up Visit"
}
},
"samples": {
"YYY456": {
"completed_by": null,
"label_on_sample": "2"
},
"ZZZ789": {
"completed_by": "Sally",
"label_on_sample": "2"
}
}
}
]
使用这样的函数:
def solution(df, *, baseline_category=None):
    """Group the flat, joined rows of ``df`` into one nested record per subject.

    Each row is classified by the first of these columns that is filled in:

    * ``firstvisit``   -> demographics row; sets ``name``, ``dob``, ``gender``
    * ``visitdate1``   -> visit row; stored under ``visits[<visit date>]``
    * ``samplenumber`` -> sample row; stored under ``samples[<sample number>]``

    Rows that fill in none of these columns are ignored.

    Args:
        df: DataFrame containing a ``subid`` column plus the optional columns
            read below. Blank cells may be empty strings, ``None`` or NaN.
        baseline_category: Optional ``visitcategory`` value (for example
            ``'Baseline Visit'``). When given, every record gains an
            ``age_at_baseline`` key holding the ``age`` of the visit row whose
            category equals this value (``None`` if no such row exists; the
            last matching row wins). When omitted, the output has no such key.

    Returns:
        A list with one dict per subject, in order of first appearance, each
        shaped as ``{'subject_id', 'name', 'dob', 'gender', 'visits',
        'samples'}`` (plus ``age_at_baseline`` when requested).

    Raises:
        KeyError: If a row has a blank ``subid``.
    """
    def new_record():
        # Keys are inserted in the order they should appear in the output.
        record = {'name': None, 'dob': None, 'gender': None}
        if baseline_category is not None:
            record['age_at_baseline'] = None
        record['visits'] = {}
        record['samples'] = {}
        return record

    by_subject_id = defaultdict(new_record)

    for _, row in df.iterrows():
        # Drop blank cells. NaN must be filtered explicitly: it is truthy, so
        # a NaN 'firstvisit' would otherwise make every row look like a
        # demographics row. NaN is the only float not equal to itself.
        non_empty = {
            k: v for k, v in row.items()
            if not (v is None or v == '' or (isinstance(v, float) and v != v))
        }
        subject_id = non_empty['subid']  # must have to group by
        first_visit = non_empty.get('firstvisit')  # optional
        sample = non_empty.get('samplenumber')  # optional
        visit = non_empty.get('visitdate1')  # optional

        if first_visit:
            by_subject_id[subject_id].update({
                'name': non_empty.get('name'),
                'dob': non_empty.get('dob'),
                'gender': non_empty.get('gender'),
            })
        elif visit:
            age = non_empty.get('age')
            category = non_empty.get('visitcategory')
            record = by_subject_id[subject_id]
            record['visits'][visit] = {
                'age': age,
                'visit_category': category,
            }
            if baseline_category is not None and category == baseline_category:
                record['age_at_baseline'] = age
        elif sample:
            by_subject_id[subject_id]['samples'][sample] = {
                'completed_by': non_empty.get('completed_by'),
                'label_on_sample': non_empty.get('label_on_sample'),
            }

    return [{'subject_id': k, **v} for k, v in by_subject_id.items()]
# Build the nested per-subject records and write them to disk as a JSON array.
# NOTE(review): `df` is presumably the joined DataFrame shown earlier; it is
# not defined in this snippet.
result = solution(df)
with open('result.json', 'w', encoding='utf-8') as outfile:
    json.dump(result, outfile)
现在我想只从基线访视(Baseline Visit)中提取信息,并将其直接存储在受试者 ID 之下,例如存为 "age_at_baseline",如下所示:
[
{
"subject_id": "1",
"name": "Bob",
"dob": "12/31/00",
"gender": "Male",
"age_at_baseline": "17"
"visits": {
"12/31/15": {
"age": "17",
"visit_category": "Baseline Visit"
},
"12/31/16": {
"age": "18",
"visit_category": "Follow Up Visit"
},
"12/31/17": {
"age": "18",
"visit_category": "Follow Up Visit"
}
},
实现这一点的最佳方法是什么?我尝试过添加条件判断,但结果要么是代码报错,要么是代码能正常运行但该字段为空。
感谢您的任何建议。