我有一个嵌套的 JSON,使用下面的函数可以把它完全展平:
def _quote_col(name):
    """Backtick-quote a column or field name so any dot in it stays literal."""
    return "`" + name.replace("`", "``") + "`"


# Flatten nested df
def flatten_df(nested_df):
    """Fully flatten the array and struct columns of a Spark DataFrame.

    Each pass explodes every array column with ``explode_outer`` and then
    replaces every struct column by one column per field, named
    ``<struct>_<field>``. Passes repeat until no column's type string in
    ``nested_df.dtypes`` starts with ``array`` or ``struct``.

    Args:
        nested_df: DataFrame to flatten. Relies on ``F`` (assumed to be
            ``pyspark.sql.functions``) being in scope at module level.

    Returns:
        The flattened DataFrame. When there is nothing to flatten, the
        input object itself is returned unchanged.
    """
    while True:
        array_cols = [name for name, dtype in nested_df.dtypes
                      if dtype.startswith('array')]
        for name in array_cols:
            # Quote the name: an unquoted "a.b" would be parsed as field b of a.
            nested_df = nested_df.withColumn(
                name, F.explode_outer(F.col(_quote_col(name))))

        # Re-read the types: exploding may have exposed structs or inner arrays.
        dtypes = nested_df.dtypes
        struct_cols = [name for name, dtype in dtypes
                       if dtype.startswith('struct')]

        if struct_cols:
            kept = [F.col(_quote_col(name)) for name, dtype in dtypes
                    if not dtype.startswith('struct')]
            # Field names come straight from the schema, so no extra
            # projection has to be built just to list them.
            expanded = [
                F.col(_quote_col(struct) + '.' + _quote_col(field))
                .alias(struct + '_' + field)
                for struct in struct_cols
                for field in nested_df.schema[struct].dataType.fieldNames()
            ]
            nested_df = nested_df.select(kept + expanded)
        elif not any(dtype.startswith('array') for _, dtype in dtypes):
            # No structs and no arrays left: flattening is complete.
            return nested_df
        # Otherwise an array-of-arrays was only partly unwrapped; go again.
我想分解嵌套结构,但不想一直展平。我只想展平到第一层,并保持随后的嵌套结构不变。
这是我正在使用的数据框的架构。
root
|-- module: array (nullable = true)
| |-- element: array (containsNull = true)
| | |-- element: struct (containsNull = true)
| | | |-- chart: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- header: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- alt: string (nullable = true)
| | | | | | | |-- assetId: string (nullable = true)
| | | | | | | |-- header: string (nullable = true)
| | | | | | | |-- height: string (nullable = true)
| | | | | | | |-- linkedid: string (nullable = true)
| | | | | | | |-- selected: boolean (nullable = true)
| | | | | | | |-- src: string (nullable = true)
| | | | | | | |-- styleCodes: string (nullable = true)
| | | | | | | |-- viewLarger: string (nullable = true)
| | | | | | | |-- width: string (nullable = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- row: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- alt: string (nullable = true)
| | | | | | | |-- label: string (nullable = true)
| | | | | | | |-- value: array (nullable = true)
| | | | | | | | |-- element: string (containsNull = true)
| | | |-- header: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- decorators: array (nullable = true)
| | | | | | |-- element: string (containsNull = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- value: string (nullable = true)
| | | |-- id: string (nullable = true)
| | | |-- image: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- alt: string (nullable = true)
| | | | | |-- assetId: string (nullable = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- originalSrc: string (nullable = true)
| | | | | |-- src: string (nullable = true)
| | | | | |-- styleCodes: string (nullable = true)
| | | | | |-- viewLarger: boolean (nullable = true)
| | | |-- paragraph: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- decorators: array (nullable = true)
| | | | | | |-- element: array (containsNull = true)
| | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | |-- style.bold: struct (nullable = true)
| | | | | | | | | |-- length: long (nullable = true)
| | | | | | | | | |-- offset: long (nullable = true)
| | | | | |-- id: string (nullable = true)
| | | | | |-- value: array (nullable = true)
| | | | | | |-- element: string (containsNull = true)
我想要的最终数据框是:
module_id | module_header | paragraph_id | paragraph_value | image_id | image_src| chart_id | chart_header | chart_row |
其中 'module_id' 是 module 数组下的 'id',module_header 是 module 数组下的 header 数组,依此类推。我只需要展平到第一层就停止,而不是一直展平到最底层。