使用列表和字典拼合嵌套的json

时间:2019-10-24 16:07:29

标签: python apache-spark pyspark

我正在尝试使用pyspark将以下json扁平化为csv,并且我正在使用以下代码。结果DataFrame显示空值。

{
  "metaHeader": {
    "messageIdentifier": "20191015202734346"
  },
  "projectResults": {
    "projectResultsHeader": {
      "projectNumber": "900025",
      "COANumber": "4457-0"
    },
    "testResultDetails": {
      "SampleDetails": [
        {
          "sampleNumber": "3821",
          "supplierFacilityID": "0024486",
          "productDescription": "Dried_item",
          "ItemNumber": "148",
          "testResults": [
            {
              "testResultsList": [
                {
                  "resultreportedname": "Product Weight",
                  "reportedValue": "5.45",
                  "unit": "lbs"
                },
                {
                  "resultreportedname": "Product Weight",
                  "reportedValue": "1.4",
                  "unit": "g"
                }
              ]
            }
          ]
        }
      ]
    }
  }
}

我已使用以下代码将 json 展平,但最终结果显示的全部是空值(如下所示)。如何才能把这种嵌套 json 正确展平为 csv?

|metaHeader_messageIdentifier|projectResults_projectResultsHeader_COANumber|projectResults_projectResultsHeader_projectNumber|sampledetails_CFAItemNumber|sampledetails_productDescription|sampledetails_sampleNumber|sampledetails_supplierFacilityID|sampledetails_tr_ResultsList_reportedValue|sampledetails_tr_ResultsList_resultreportedname|sampledetails_tr_ResultsList_unit|
+----------------------------+---------------------------------------------+-------------------------------------------------+---------------------------+--------------------------------+--------------------------+--------------------------------+------------------------------------------+-----------------------------------------------+---------------------------------+
|                        null|                                         null|                                             null|                       null|                            null|                      null|                            null|                                      null|                                           null|                             null|
|                        null|                                         null|                                             null|                       null|                            null|                      null|                            null|                                      null|                                           null|                             null|
|                        null|                                         null|                                             null|                       null|                            null|                      null|                            null|                                      null|                                           null|                             null|

# Read the multi-line JSON file (without multiline=true Spark expects one
# JSON document per physical line) and keep the schema it infers.
source_df = spark.read.option('multiline', 'true').json('s3://{BUCKETNAME}/Untitled-1.json')

base_schema = source_df.schema
print(f'base schema is {base_schema}')

def flatten_schema(schema):
    """Return the dotted paths of all leaf fields in a Spark schema dict.

    Takes the schema in the dict form returned by ``df.schema.jsonValue()``
    and walks it recursively. Struct fields are descended into; atomic and
    array fields are treated as leaves (the caller explodes arrays
    separately), so each entry is a full path like ``'a.b.c'``.

    :param schema: dict produced by ``StructType.jsonValue()``
    :return: list of dot-separated leaf field paths
    """
    def _flatten(node, path, accum):
        # Extend the dotted path with this node's name, if it has one
        # (the root schema node and array/type wrapper nodes are nameless).
        name = node.get("name")
        if name is not None:
            path = "{0}.{1}".format(path, name) if path else name
        fields = node.get("fields")
        if isinstance(fields, list):
            # Struct node: recurse into each child field.
            for field in fields:
                _flatten(field, path, accum)
        elif isinstance(node.get("type"), dict):
            # Field whose type is itself a nested description (struct/array).
            _flatten(node.get("type"), path, accum)
        else:
            # Atomic leaf (or array element wrapper): record the full path.
            accum.append(path)

    accum = []
    _flatten(schema, "", accum)
    return accum

source = 's3://{BUCKETNAME}/Untitled-1.json'

# BUG FIX: the original second read omitted the multiline option, so Spark
# parsed each physical line of the pretty-printed JSON as its own record and
# every column came back null. Multi-line JSON must be read with
# multiline=true here as well, exactly as for the schema-inference read.
df = spark.read.option('multiline', 'true').json(source, schema=base_schema)
print(f'df is {df}')

# Each (array_column, alias) pair is one level of nesting to unroll.
# Per level: flatten struct fields to dotted paths, alias them with
# underscores, explode the array column, then drop the exploded source.
# This replaces the three hand-copied df2..df5 stages of the original.
explode_steps = [
    ('projectResults_testResultDetails_SampleDetails', 'sampledetails'),
    ('sampledetails_testResults', 'sampledetails_tr'),
    ('sampledetails_tr_testResultsList', 'sampledetails_tr_ResultsList'),
]
for array_col, alias in explode_steps:
    columns_list = flatten_schema(df.schema.jsonValue())
    print(f'columns_list is {columns_list}')
    df = df.select(*(col(x).alias(x.replace('.', '_')) for x in columns_list))
    df = df.select('*', explode_outer(df[array_col]).alias(alias))
    df = df.drop(array_col)

# Final pass: lift the leaf fields of the last exploded struct up to
# top-level columns so the frame is fully flat and CSV-writable.
final_columns = flatten_schema(df.schema.jsonValue())
print(f'final columns are {final_columns}')
df6 = df.select(*(col(x).alias(x.replace('.', '_')) for x in final_columns))

df6.show()

0 个答案:

没有答案