我曾尝试使用 flatten_json 库对其进行扁平化,但一直没弄清楚应该如何扁平化 "project" 和 "phase" 这两个键,才能避免它们产生额外的“列”。
我从rest API中得到类似于以下JSON的内容:
# Fetch the findings from the REST API.
results = requests.get(
    apiUrl,
    verify=rootCaCert,
    headers=headers,
    params=httpParams,
)

# Flatten every record of the decoded response and collect the results.
# NOTE(review): `flatten` is presumably the flatten_json library helper and
# `outputJson` a list created earlier -- neither is defined in this snippet.
for finding in list(results.json()):
    outputJson.append(flatten(finding))

# Wrap the flattened records in a single-column pandas frame.
findingsFrame = pandas.Series(outputJson).to_frame()

# Hand the pandas frame over to Spark.
sparkSession = (
    SparkSession.builder
    .appName('DEV Canopy Feed')
    .getOrCreate()
)
df = sparkSession.createDataFrame(findingsFrame)
输入JSON如下所示:
[
{
"status": "Open",
"project_finding__id": 23,
"rating__type": "Medium",
"title": "Some string title",
"date_modified": "2017-04-10T15:04:32.527000Z",
"cvss3_score": null,
"template_finding_id": null,
"project_finding__uuid": "ba42302e-b879-11e9-a2a3-2a2ae2dbcce4",
"references": [
{
"type": "system",
"reference": "id-123",
"title": "some reference title"
}
],
"date_created": "2017-04-10T15:04:32.527000Z",
"phase": {
"status": "Completed",
"end_date": "2017-03-31",
"uuid": "ba423506-b879-11e9-a2a3-2a2ae2dbcce4",
"reference": "2017-1234",
"title": "some title for phase",
"notes": "<p><strong>Some Text</strong>: in HTML</p><br/>\n<br/>\n<strong>this doesn't matter</strong><br/>\nfreeform html representation of notes<br/>\n<br/>\n<strong>Special needs:</strong><br/>",
"contacts": [
{
"notes": "<p>Random Note about contact</p>",
"role": "Primary Requestor",
"email": "user@domain.tld"
},
{
"notes": "<p>Random Note about contact</p>",
"role": "other role",
"email": "user2@domain.tld"
}
],
"start_date": "2017-02-20",
"project": {
"description": "\"some description of project\"",
"reference": "123",
"title": "Title of Project",
"company": {
"abbreviation": "",
"description": "",
"id": 345,
"name": "BUSU",
"uuid": "ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4"
},
"id": 567,
"uuid": "20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4"
},
"type": "Client Server",
"id": 12312312,
"description": "Block of text string"
},
"cvss2_score": "2.0",
"first_date_created": "2017-04-10T15:04:32.527000Z",
"first_phase_reference": "2017-1234"
}
]
答案 0 :(得分:0)
这是我过去用来拉平嵌套json的函数。
def flatten_json(y):
    """Flatten a nested structure of dicts and lists into a one-level dict.

    Each leaf value is stored under a key made of the path leading to it,
    joined with ``'_'``.  Dict keys contribute their string form and list
    items contribute their zero-based position, e.g.
    ``{"a": {"b": [10, 20]}}`` becomes ``{"a_b_0": 10, "a_b_1": 20}``.

    Args:
        y: A dict, a list, or a scalar leaf value.

    Returns:
        A new dict mapping each path to its leaf value.  Empty dicts and
        lists contribute no entries; a bare scalar is stored under ``''``.
        If two different paths produce the same key, the one visited last
        overwrites the earlier one.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance (rather than an exact type check) also descends into
        # dict/list subclasses such as OrderedDict.
        if isinstance(x, dict):
            for key, value in x.items():
                # str() keeps non-string keys (e.g. ints) from raising
                # TypeError during concatenation.
                flatten(value, name + str(key) + '_')
        elif isinstance(x, list):
            for index, value in enumerate(x):
                flatten(value, name + str(index) + '_')
        else:
            # Drop the trailing '_' separator appended by the parent level.
            out[name[:-1]] = x

    flatten(y)
    return out
您可以试一下,看看它能否给出您想要的输出(我不清楚您期望的输出是什么;正如评论中所建议的,您最好在问题中补充期望的输出):
# Sample API payload: a list holding one finding record with nested
# "references", "phase", "contacts", "project" and "company" structures.
data = [
    {
        "status": "Open",
        "project_finding__id": 23,
        "rating__type": "Medium",
        "title": "Some string title",
        "date_modified": "2017-04-10T15:04:32.527000Z",
        "cvss3_score": None,
        "template_finding_id": None,
        "project_finding__uuid": "ba42302e-b879-11e9-a2a3-2a2ae2dbcce4",
        "references": [
            {
                "type": "system",
                "reference": "id-123",
                "title": "some reference title"
            }
        ],
        "date_created": "2017-04-10T15:04:32.527000Z",
        "phase": {
            "status": "Completed",
            "end_date": "2017-03-31",
            "uuid": "ba423506-b879-11e9-a2a3-2a2ae2dbcce4",
            "reference": "2017-1234",
            "title": "some title for phase",
            "notes": "<p><strong>Some Text</strong>: in HTML</p><br/>\n<br/>\n<strong>this doesn't matter</strong><br/>\nfreeform html representation of notes<br/>\n<br/>\n<strong>Special needs:</strong><br/>",
            "contacts": [
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "Primary Requestor",
                    "email": "user@domain.tld"
                },
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "other role",
                    "email": "user2@domain.tld"
                }
            ],
            "start_date": "2017-02-20",
            "project": {
                # The value itself contains literal double quotes, so they
                # must be escaped inside the string literal.
                "description": "\"some description of project\"",
                "reference": "123",
                "title": "Title of Project",
                "company": {
                    "abbreviation": "",
                    "description": "",
                    "id": 345,
                    "name": "BUSU",
                    "uuid": "ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4"
                },
                "id": 567,
                "uuid": "20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4"
            },
            "type": "Client Server",
            "id": 12312312,
            "description": "Block of text string"
        },
        "cvss2_score": "2.0",
        "first_date_created": "2017-04-10T15:04:32.527000Z",
        "first_phase_reference": "2017-1234"
    }
]
def flatten_json(y):
    """Flatten a nested structure of dicts and lists into a one-level dict.

    Each leaf value is stored under a key made of the path leading to it,
    joined with ``'_'``.  Dict keys contribute their string form and list
    items contribute their zero-based position, e.g.
    ``{"a": {"b": [10, 20]}}`` becomes ``{"a_b_0": 10, "a_b_1": 20}``.

    Args:
        y: A dict, a list, or a scalar leaf value.

    Returns:
        A new dict mapping each path to its leaf value.  Empty dicts and
        lists contribute no entries; a bare scalar is stored under ``''``.
        If two different paths produce the same key, the one visited last
        overwrites the earlier one.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance (rather than an exact type check) also descends into
        # dict/list subclasses such as OrderedDict.
        if isinstance(x, dict):
            for key, value in x.items():
                # str() keeps non-string keys (e.g. ints) from raising
                # TypeError during concatenation.
                flatten(value, name + str(key) + '_')
        elif isinstance(x, list):
            for index, value in enumerate(x):
                flatten(value, name + str(index) + '_')
        else:
            # Drop the trailing '_' separator appended by the parent level.
            out[name[:-1]] = x

    flatten(y)
    return out
import re

import pandas as pd

# Flatten the whole payload into one {path: leaf_value} dict.
flat = flatten_json(data)

results = pd.DataFrame()
special_cols = []

# A key with an embedded "_<digits>_" segment comes from an item of a nested
# list: the digits become the row index and the text after the segment
# becomes the column name.  The leading "0_" record prefix has no underscore
# in front of it, so top-level fields such as "0_status" do not match.
list_item_key = re.compile(r'_(\d+)_(.*)')

for item, value in flat.items():
    match = list_item_key.search(item)
    if match is None:
        # Not a list item: remember it and broadcast it to all rows below.
        special_cols.append(item)
        continue
    row_idx = int(match.group(1))
    column = match.group(2).replace('_', '')
    # Assigning through .loc creates the row/column when it is missing.
    results.loc[row_idx, column] = value

# Scalar fields are repeated on every row produced by the list items.
for item in special_cols:
    results[item] = flat[item]
输出:
# Print the frame rendered as plain text.
print(results.to_string())
type reference title notes role email 0_status 0_project_finding__id 0_rating__type 0_title 0_date_modified 0_cvss3_score 0_template_finding_id 0_project_finding__uuid 0_date_created 0_phase_status 0_phase_end_date 0_phase_uuid 0_phase_reference 0_phase_title 0_phase_notes 0_phase_start_date 0_phase_project_description 0_phase_project_reference 0_phase_project_title 0_phase_project_company_abbreviation 0_phase_project_company_description 0_phase_project_company_id 0_phase_project_company_name 0_phase_project_company_uuid 0_phase_project_id 0_phase_project_uuid 0_phase_type 0_phase_id 0_phase_description 0_cvss2_score 0_first_date_created 0_first_phase_reference
0 system id-123 some reference title <p>Random Note about contact</p> Primary Requestor user@domain.tld Open 23 Medium Some string title 2017-04-10T15:04:32.527000Z None None ba42302e-b879-11e9-a2a3-2a2ae2dbcce4 2017-04-10T15:04:32.527000Z Completed 2017-03-31 ba423506-b879-11e9-a2a3-2a2ae2dbcce4 2017-1234 some title for phase <p><strong>Some Text</strong>: in HTML</p><br/... 2017-02-20 "some description of project" 123 Title of Project 345 BUSU ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4 567 20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4 Client Server 12312312 Block of text string 2.0 2017-04-10T15:04:32.527000Z 2017-1234
1 NaN NaN NaN <p>Random Note about contact</p> other role user2@domain.tld Open 23 Medium Some string title 2017-04-10T15:04:32.527000Z None None ba42302e-b879-11e9-a2a3-2a2ae2dbcce4 2017-04-10T15:04:32.527000Z Completed 2017-03-31 ba423506-b879-11e9-a2a3-2a2ae2dbcce4 2017-1234 some title for phase <p><strong>Some Text</strong>: in HTML</p><br/... 2017-02-20 "some description of project" 123 Title of Project 345 BUSU ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4 567 20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4 Client Server 12312312 Block of text string 2.0 2017-04-10T15:04:32.527000Z 2017-1234