我尝试使用union函数合并2个DataFrame,一个包含旧数据,另一个包含新数据。这曾经有效,直到我尝试向旧的DataFrame动态添加新字段,因为我的架构正在不断发展。
这意味着我的旧数据将丢失一个字段,而新数据将拥有它。为了使联合起作用,我使用下面的evolveSchema函数添加字段。
这导致我粘贴在代码下面的输出/异常,包括我的调试打印。
列排序和制作字段可以通过使DataFrame尽可能相同来尝试解决此问题,但它仍然存在。模式打印显示在这些操作之后它们看起来都是相同的。
任何有关进一步调试的帮助都将不胜感激。
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, SQLContext}
object Merger {
def apply(sqlContext: SQLContext, oldDataSet: Option[DataFrame], newEnrichments: Option[DataFrame]): Option[DataFrame] = {
(oldDataSet, newEnrichments) match {
case (None, None) => None
case (None, _) => newEnrichments
case (Some(existing), None) => Some(existing)
case (Some(existing), Some(news)) => Some {
val evolvedOldDataSet = evolveSchema(existing)
println("EVOLVED OLD SCHEMA FIELD NAMES:" + evolvedOldDataSet.schema.fieldNames.mkString(","))
println("NEW SCHEMA FIELD NAMES:" + news.schema.fieldNames.mkString(","))
println("EVOLVED OLD SCHEMA FIELD TYPES:" + evolvedOldDataSet.schema.fields.map(_.dataType).mkString(","))
println("NEW SCHEMA FIELD TYPES:" + news.schema.fields.map(_.dataType).mkString(","))
println("OLD SCHEMA")
existing.printSchema();
println("PRINT EVOLVED OLD SCHEMA")
evolvedOldDataSet.printSchema()
println("PRINT NEW SCHEMA")
news.printSchema()
val nullableEvolvedOldDataSet = setNullableTrue(evolvedOldDataSet)
val nullableNews = setNullableTrue(news)
println("NULLABLE EVOLVED OLD")
nullableEvolvedOldDataSet.printSchema()
println("NULLABLE NEW")
nullableNews.printSchema()
val unionData =nullableEvolvedOldDataSet.union(nullableNews)
val result = unionData.sort(
unionData("timestamp").desc
).dropDuplicates(
Seq("id")
)
result.cache()
}
}
}
def GENRE_FIELD : String = "station_genre"
// Handle missing fields in old data
def evolveSchema(oldDataSet: DataFrame): DataFrame = {
if (!oldDataSet.schema.fieldNames.contains(GENRE_FIELD)) {
val columnAdded = oldDataSet.withColumn(GENRE_FIELD, lit("N/A"))
// Columns should be in the same order for union
val columnNamesInOrder = Seq("id", "station_id", "station_name", "station_timezone", "station_genre", "publisher_id", "publisher_name", "group_id", "group_name", "timestamp")
val reorderedColumns = columnAdded.select(columnNamesInOrder.head, columnNamesInOrder.tail: _*)
reorderedColumns
}
else
oldDataSet
}
def setNullableTrue(df: DataFrame) : DataFrame = {
// get schema
val schema = df.schema
// create new schema with all fields nullable
val newSchema = StructType(schema.map {
case StructField(columnName, dataType, _, metaData) => StructField( columnName, dataType, nullable = true, metaData)
})
// apply new schema
df.sqlContext.createDataFrame( df.rdd, newSchema )
}
}
演变的旧模式字段名称: ID,station_id,STATION_NAME,station_timezone,station_genre,PUBLISHER_ID,PUBLISHER_NAME,GROUP_ID,组名,时间戳
新计划字段名称: ID,station_id,STATION_NAME,station_timezone,station_genre,PUBLISHER_ID,PUBLISHER_NAME,GROUP_ID,组名,时间戳
演变的旧模式字段类型: StringType,LongType,StringType,StringType,StringType,LongType,StringType,LongType,StringType,LongType
新模式字段类型: StringType,LongType,StringType,StringType,StringType,LongType,StringType,LongType,StringType,LongType
OLD SCHEMA root | - id:string(nullable = true)| - station_id: long(nullable = true)| - station_name:string(nullable = true) | - station_timezone:string(nullable = true)| - publisher_id:long (nullable = true)| - publisher_name:string(nullable = true)| - group_id:long(nullable = true)| - group_name:string(nullable = true)| - timestamp:long(nullable = true)
PRINT EVOLVED OLD SCHEMA root | - id:string(nullable = true)| - station_id:long(nullable = true)| - station_name:string(可为空 = true)| - station_timezone:string(nullable = true)| - station_genre:string(nullable = false)| - publisher_id:long (nullable = true)| - publisher_name:string(nullable = true)| - group_id:long(nullable = true)| - group_name:string(nullable = true)| - timestamp:long(nullable = true)
PRINT NEW SCHEMA root | - id:string(nullable = true)| - station_id:long(nullable = true)| - station_name:string(可为空 = true)| - station_timezone:string(nullable = true)| - station_genre:string(nullable = true)| - publisher_id:long (nullable = true)| - publisher_name:string(nullable = true)| - group_id:long(nullable = true)| - group_name:string(nullable = true)| - timestamp:long(nullable = true)
NULLABLE EVOLVED OLD root | - id:string(nullable = true)| - station_id:long(nullable = true)| - station_name:string(可为空 = true)| - station_timezone:string(nullable = true)| - station_genre:string(nullable = true)| - publisher_id:long (nullable = true)| - publisher_name:string(nullable = true)| - group_id:long(nullable = true)| - group_name:string(nullable = true)| - timestamp:long(nullable = true)
NULLABLE NEW root | - id:string(nullable = true)| - station_id: long(nullable = true)| - station_name:string(nullable = true) | - station_timezone:string(nullable = true)| - station_genre: string(nullable = true)| - publisher_id:long(nullable = true) | - publisher_name:string(nullable = true)| - group_id:long (nullable = true)| - group_name:string(nullable = true)| - timestamp:long(nullable = true)
2017-01-18 15:59:32错误org.apache.spark.internal.Logging $ class 执行程序:91 - 阶段2.0(TID 4)中的任务1.0中的异常 scala.MatchError:false(类java.lang.Boolean)at org.apache.spark.sql.catalyst.CatalystTypeConverters $ $字符串转换.toCatalystImpl(CatalystTypeConverters.scala:296) 在
...
com.companystuff.meta.uploader.Merger $。适用(Merger.scala:49)
...
引起:scala.MatchError:false(类java.lang.Boolean)at org.apache.spark.sql.catalyst.CatalystTypeConverters $ $字符串转换.toCatalystImpl(CatalystTypeConverters.scala:296) ...
答案 0 :(得分:0)
这是因为即使其架构相同,也要按实际数据排序。 因此,只需选择所有必需的列,然后进行并集查询即可。
类似这样的东西:
val columns:Seq[String]= ....
val df = oldDf.select(columns:_*).union(newDf.select(columns:_*)
希望它对您有帮助