我有一个包含数组的嵌套模式:
root
|-- alarm_time: string (nullable = true)
|-- alarm_id: string (nullable = true)
|-- user: struct (nullable = true)
| |-- name: string (nullable = true)
| |-- family: string (nullable = true)
| |-- address: struct (nullable = true)
| | |-- postalcode: string (nullable = true)
| | |-- line1: string (nullable = true)
| | |-- city: string (nullable = true)
| | |-- country: string (nullable = true)
|-- device: struct (nullable = true)
| |-- device_usage: string (nullable = true)
| |-- device_id: string (nullable = true)
|-- alarm_info: struct (nullable = true)
| |-- type: string (nullable = true)
| |-- reason: string (nullable = true)
| |-- data: struct (nullable = true)
| | |-- alarm_severity: long (nullable = true)
| | |-- extra_info: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- producer: string (nullable = true)
| | | | |-- comment: string (nullable = true)
我曾经忽略数组字段并使用此代码来展平我的架构:
/** Recursively flattens nested struct columns into dotted-path column references.
  * Array fields are not expanded here — they are selected as-is.
  *
  * @param schema the struct to flatten
  * @param prefix dotted path of the enclosing structs; null at the top level
  * @return one Column per leaf field, addressed by its full dotted path
  */
def flattenSchema(schema: StructType, prefix: String = null) : Array[Column] = {
  schema.fields.flatMap { field =>
    val path = if (prefix == null) field.name else s"$prefix.${field.name}"
    field.dataType match {
      case nested: StructType => flattenSchema(nested, path)
      case _                  => Array(col(path))
    }
  }
}
并像df.select(flattenSchema(df.schema):_*)
一样使用它,但现在我有一个需要保留数组数据的用例,我唯一能想到的办法是展开(explode)数组、让数据变成多行,但一直没有成功。由于我是把列作为可变参数(varargs)传入的,因此无法再传递其他参数。
如何实现这一点(使用展开的数组进行扁平化架构?)
答案 0 :(得分:2)
Am1rr3zA,如果同一层级上有两个数组,你给出的方案就会失效:Spark 不允许在同一个 select 子句里同时使用两个 explode,会报错 "Only one generator is allowed per select clause but found 2: explode(_1), explode(_2)"。
我已经更新了解决方案以跟踪嵌套中的复杂类型
/** Recursively flattens `df` until no struct, array, or map columns remain.
  *
  * Array columns are exploded one at a time: each exploded column is
  * cross-joined back onto the already-selected simple columns. This sidesteps
  * Spark's "Only one generator allowed per select clause" restriction when
  * several arrays sit at the same nesting level.
  *
  * NOTE(review): the cross join gives Cartesian semantics for independent
  * arrays (rows multiply) — confirm that is what the use case needs.
  *
  * @param df the (possibly nested) input frame
  * @return a frame with only simple, non-nested columns
  */
def flattenDataFrame(df: DataFrame): DataFrame = {
  if (!isNested(df)) {
    df
  } else {
    // Split the flattened schema into generator (explode) columns and plain columns.
    val (complexColumns, simpleColumns) =
      flattenSchema(df.schema).partition { case (_, isComplex) => isComplex }

    // Start from the simple columns, then fold each exploded array in via a
    // cross join, re-flattening after every join because exploding an array
    // of structs can expose new nesting.
    complexColumns.foldLeft(df.select(simpleColumns.map(_._1): _*)) {
      case (acc, (complexCol, _)) =>
        flattenDataFrame(acc.crossJoin(df.select(complexCol)))
    }
  }
}
/** Flattens `schema`, tagging each resulting column with whether it is
  * "complex" (i.e. an exploded array that must be cross-joined separately).
  *
  * @param schema struct to flatten
  * @param prefix dotted path of the enclosing structs; null at the top level
  * @return (column, isComplex) pairs; dotted paths are renamed with '_'
  */
private def flattenSchema(schema: StructType, prefix: String = null): Array[(Column, Boolean)] = {
  schema.fields.flatMap { field =>
    val columnName = if (prefix == null) field.name else prefix + "." + field.name
    field.dataType match {
      case _: ArrayType =>
        // explode_outer keeps rows whose array is null or empty.
        Array((explode_outer(col(columnName)).as(columnName.replace(".", "_")), true))
      case structType: StructType =>
        // Recurse into nested structs, extending the dotted prefix.
        flattenSchema(structType, columnName)
      case _ =>
        val columnNameWithUnderscores = columnName.replace(".", "_")
        // Preserve the column-level encoding hint on the renamed column.
        val metadata = new MetadataBuilder().putString("encoding", "ZSTD").build()
        Array((col(columnName).as(columnNameWithUnderscores, metadata), false))
    }
  }
  // The original trailing `.filter(_ != None)` was dead code: a tuple never
  // equals None, so every element passed. It has been removed.
}
/** Returns true if any top-level column of `df` has a complex type
  * (array, map, or struct) and therefore still needs flattening.
  */
def isNested(df: DataFrame): Boolean = {
  // `exists` with a pattern alternative replaces the original
  // flatMap-to-Array(true/false)-then-exists roundabout.
  df.schema.fields.exists { field =>
    field.dataType match {
      case _: ArrayType | _: MapType | _: StructType => true
      case _                                         => false
    }
  }
}
答案 1 :(得分:0)
所以我现在的做法(Spark 2.2+)是:检查架构是否嵌套,并反复调用 flattenSchema,
直到它变得扁平化。
/** Repeatedly flattens `df` until its schema contains no nested types.
  * Each pass resolves one level of nesting, so deep schemas converge.
  */
def makeItFlat(df: DataFrame): DataFrame =
  if (!isSchemaNested(df)) df
  else makeItFlat(df.select(flattenSchema(df.schema): _*))
makeItFlat() 是一个递归方法:它检查架构是否已展平,若尚未展平,则再次递归调用 flattenSchema。
/** Returns true if the schema definition of `df` contains any nested
  * data type (array, map, or struct) at the top level.
  */
def isSchemaNested(df: DataFrame): Boolean = {
  // `exists` with a pattern alternative replaces the original
  // flatMap-to-Array(true/false)-then-exists roundabout.
  df.schema.fields.exists { field =>
    field.dataType match {
      case _: ArrayType | _: MapType | _: StructType => true
      case _                                         => false
    }
  }
}
isSchemaNested 的职责是检查架构定义(definition)中是否存在任何嵌套数据类型。
/** Flattens one level of `schema` into selectable columns.
  *
  * Structs are recursed into; arrays become a single explode_outer column;
  * map columns are dropped; simple columns are renamed with underscores.
  *
  * @param schema struct to flatten
  * @param prefix dotted path of the enclosing structs; null at the top level
  * @return columns for one flattening pass (call repeatedly via makeItFlat)
  */
private def flattenSchema(schema: StructType, prefix: String = null): Array[Column] = {
  schema.fields.flatMap { field =>
    val columnName = if (prefix == null) field.name else prefix + "." + field.name
    field.dataType match {
      case _: ArrayType =>
        // explode_outer keeps rows whose array is null or empty.
        Array(explode_outer(col(columnName)).as(columnName.replace(".", "_")))
      case _: MapType =>
        // Map columns are intentionally dropped. Returning an empty array is
        // clearer than the original `None` (which flatMap silently discarded
        // via an implicit Option-to-iterable conversion).
        Array.empty[Column]
      case structType: StructType =>
        // Recurse into nested structs, extending the dotted prefix.
        flattenSchema(structType, columnName)
      case _ =>
        val columnNameWithUnderscores = columnName.replace(".", "_")
        // Preserve the column-level encoding hint on the renamed column.
        val metadata = new MetadataBuilder().putString("encoding", "ZSTD").build()
        Array(col(columnName).as(columnNameWithUnderscores, metadata))
    }
  }
  // The original trailing `.filter(_ != None)` was dead code (a Column never
  // equals None) and has been removed.
}