我正在尝试使用pyspark将以下json扁平化为csv,并且我正在使用以下代码。结果DataFrame显示空值。
{
"metaHeader": {
"messageIdentifier": "20191015202734346"
},
"projectResults": {
"projectResultsHeader": {
"projectNumber": "900025",
"COANumber": "4457-0"
},
"testResultDetails": {
"SampleDetails": [
{
"sampleNumber": "3821",
"supplierFacilityID": "0024486",
"productDescription": "Dried_item",
"ItemNumber": "148",
"testResults": [
{
"testResultsList": [
{
"resultreportedname": "Product Weight",
"reportedValue": "5.45",
"unit": "lbs"
},
{
"resultreportedname": "Product Weight",
"reportedValue": "1.4",
"unit": "g"
}
]
}
]
}
]
}
}
}
我已使用此代码将json展平,但最终结果显示了所有空值,如下所示,如何将这种json展平为csv?
|metaHeader_messageIdentifier|projectResults_projectResultsHeader_COANumber|projectResults_projectResultsHeader_projectNumber|sampledetails_CFAItemNumber|sampledetails_productDescription|sampledetails_sampleNumber|sampledetails_supplierFacilityID|sampledetails_tr_ResultsList_reportedValue|sampledetails_tr_ResultsList_resultreportedname|sampledetails_tr_ResultsList_unit|
+----------------------------+---------------------------------------------+-------------------------------------------------+---------------------------+--------------------------------+--------------------------+--------------------------------+------------------------------------------+-----------------------------------------------+---------------------------------+
| null| null| null| null| null| null| null| null| null| null|
| null| null| null| null| null| null| null| null| null| null|
| null| null| null| null| null| null| null| null| null| null|
# Initial pass: read the multi-line JSON once so Spark can infer the full
# nested schema, which is reused for the second (schema-driven) read below.
source_df = spark.read.option('multiline', 'true').json('s3://{BUCKETNAME}/Untitled-1.json')
base_schema = source_df.schema
print(f'base schema is {base_schema}')
def flatten_schema(schema):
    """Take schema as returned from schema().jsonValue()
    and return list of field names with full path"""
    paths = []

    def walk(node, prefix):
        # A named node extends the dotted path; anonymous nodes (e.g. the
        # inner dict of an array/struct "type") keep the current prefix.
        field_name = node.get("name")
        if field_name is not None:
            prefix = f"{prefix}.{field_name}" if prefix else field_name
        print('path is {}'.format(prefix))

        fields = node.get("fields")
        node_type = node.get("type")
        if isinstance(fields, list):
            # Struct: recurse into each child field.
            for child in fields:
                walk(child, prefix)
        elif isinstance(node_type, dict):
            # Wrapped complex type: descend into the type descriptor.
            walk(node_type, prefix)
        else:
            # Atomic leaf (or array treated as a leaf): record its full path.
            paths.append(prefix)

    walk(schema, "")
    return paths
# Second pass: re-read the file with the inferred schema, then iteratively
# flatten structs and explode arrays until every column is atomic.
source = 's3://{BUCKETNAME}/Untitled-1.json'

# BUG FIX: the original second read omitted .option('multiline', 'true').
# Without it, Spark expects one JSON document per line; a pretty-printed
# multi-line file fails to parse and every column comes back null — exactly
# the all-null output shown above. The option must be set on EVERY read of
# a multi-line file, not just the schema-inference read.
df1 = spark.read.option('multiline', 'true').json(source, schema=base_schema)
schema = df1.schema.jsonValue()
print(f'df1 is {df1}')
print(f'schema is {schema}')

# Flatten nested struct fields into dotted paths, aliased with underscores.
columns_list = flatten_schema(schema)
print(f'columns_list is {columns_list}')
df2 = df1.select(*(col(x).alias(x.replace('.', '_')) for x in columns_list))
print(f'df2 is {df2}')

# Explode the SampleDetails array (one row per sample); explode_outer keeps
# rows whose array is null/empty.
df3 = df2.select("*", explode_outer(df2.projectResults_testResultDetails_SampleDetails).alias("sampledetails"))
df3 = df3.drop('projectResults_testResultDetails_SampleDetails')
print(f'df3 is {df3}')

# Re-flatten the struct produced by the explode, then explode testResults.
schema3 = df3.schema.jsonValue()
columns_list3 = flatten_schema(schema3)
print(f'columns_list3 is {columns_list3}')
df4 = df3.select(*(col(x).alias(x.replace('.', '_')) for x in columns_list3))
df4 = df4.select("*", explode_outer(df4.sampledetails_testResults).alias("sampledetails_tr"))
df4 = df4.drop('sampledetails_testResults')
print(f'df4 is {df4}')

# Same pattern one level deeper: explode testResultsList.
schema4 = df4.schema.jsonValue()
columns_list4 = flatten_schema(schema4)
print(f'columns_list4 is {columns_list4}')
df5 = df4.select(*(col(x).alias(x.replace('.', '_')) for x in columns_list4))
df5 = df5.select("*", explode_outer(df5.sampledetails_tr_testResultsList).alias("sampledetails_tr_ResultsList"))
df5 = df5.drop('sampledetails_tr_testResultsList')
print(f'df5 is {df5}')

# Final flatten: every remaining column is atomic and ready for CSV output.
schema5 = df5.schema.jsonValue()
print(f'schema5 is {schema5}')
columns_list5 = flatten_schema(schema5)
print(f'columns_list5 is {columns_list5}')
df6 = df5.select(*(col(x).alias(x.replace('.', '_')) for x in columns_list5))
df6.show()