Spark 2.0.2 DataFrame union

时间:2017-01-18 21:39:26

标签: scala apache-spark spark-dataframe

我尝试使用union函数合并2个DataFrame,一个包含旧数据,另一个包含新数据。这曾经有效,直到我尝试向旧的DataFrame动态添加新字段,因为我的架构正在不断发展。

这意味着我的旧数据将丢失一个字段,而新数据将拥有它。为了使联合起作用,我使用下面的evolveSchema函数添加字段。

这导致我粘贴在代码下面的输出/异常,包括我的调试打印。

列排序和制作字段可以通过使DataFrame尽可能相同来尝试解决此问题,但它仍然存在。模式打印显示在这些操作之后它们看起来都是相同的。

任何有关进一步调试的帮助都将不胜感激。

import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, SQLContext}

object Merger {

  def apply(sqlContext: SQLContext, oldDataSet: Option[DataFrame], newEnrichments: Option[DataFrame]): Option[DataFrame] = {

    (oldDataSet, newEnrichments) match {
      case (None, None) => None
      case (None, _) => newEnrichments
      case (Some(existing), None) => Some(existing)
      case (Some(existing), Some(news)) => Some {

        val evolvedOldDataSet = evolveSchema(existing)

        println("EVOLVED OLD SCHEMA FIELD NAMES:" + evolvedOldDataSet.schema.fieldNames.mkString(","))
        println("NEW SCHEMA FIELD NAMES:" + news.schema.fieldNames.mkString(","))

        println("EVOLVED OLD SCHEMA FIELD TYPES:" + evolvedOldDataSet.schema.fields.map(_.dataType).mkString(","))
        println("NEW SCHEMA FIELD TYPES:" + news.schema.fields.map(_.dataType).mkString(","))

        println("OLD SCHEMA")
        existing.printSchema();
        println("PRINT EVOLVED OLD SCHEMA")
        evolvedOldDataSet.printSchema()
        println("PRINT NEW SCHEMA")
        news.printSchema()

        val nullableEvolvedOldDataSet = setNullableTrue(evolvedOldDataSet)
        val nullableNews = setNullableTrue(news)

        println("NULLABLE EVOLVED OLD")
        nullableEvolvedOldDataSet.printSchema()
        println("NULLABLE NEW")
        nullableNews.printSchema()

        val unionData =nullableEvolvedOldDataSet.union(nullableNews)

        val result = unionData.sort(
          unionData("timestamp").desc
        ).dropDuplicates(
          Seq("id")
        )
        result.cache()
      }
    }
  }

  def GENRE_FIELD : String = "station_genre"

  // Handle missing fields in old data
  def evolveSchema(oldDataSet: DataFrame): DataFrame = {
    if (!oldDataSet.schema.fieldNames.contains(GENRE_FIELD)) {

      val columnAdded = oldDataSet.withColumn(GENRE_FIELD, lit("N/A"))

      // Columns should be in the same order for union
      val columnNamesInOrder = Seq("id", "station_id", "station_name", "station_timezone", "station_genre", "publisher_id", "publisher_name", "group_id", "group_name", "timestamp")
      val reorderedColumns = columnAdded.select(columnNamesInOrder.head, columnNamesInOrder.tail: _*)

      reorderedColumns
    }
    else
      oldDataSet
  }

  def setNullableTrue(df: DataFrame) : DataFrame = {
    // get schema
    val schema = df.schema
    // create new schema with all fields nullable
    val newSchema = StructType(schema.map {
      case StructField(columnName, dataType, _, metaData) => StructField( columnName, dataType, nullable = true, metaData)
    })
    // apply new schema
    df.sqlContext.createDataFrame( df.rdd, newSchema )
  }

}
  

演变的旧模式字段名称:   ID,station_id,STATION_NAME,station_timezone,station_genre,PUBLISHER_ID,PUBLISHER_NAME,GROUP_ID,组名,时间戳

     

新计划字段名称:   ID,station_id,STATION_NAME,station_timezone,station_genre,PUBLISHER_ID,PUBLISHER_NAME,GROUP_ID,组名,时间戳

     

演变的旧模式字段类型:   StringType,LongType,StringType,StringType,StringType,LongType,StringType,LongType,StringType,LongType

     

新模式字段类型:   StringType,LongType,StringType,StringType,StringType,LongType,StringType,LongType,StringType,LongType

     

OLD SCHEMA   root | - id:string(nullable = true)| - station_id:   long(nullable = true)| - station_name:string(nullable = true)   | - station_timezone:string(nullable = true)| - publisher_id:long   (nullable = true)| - publisher_name:string(nullable = true)| -   group_id:long(nullable = true)| - group_name:string(nullable =   true)| - timestamp:long(nullable = true)

     

PRINT EVOLVED OLD SCHEMA root | - id:string(nullable = true)| -   station_id:long(nullable = true)| - station_name:string(可为空   = true)| - station_timezone:string(nullable = true)| - station_genre:string(nullable = false)| - publisher_id:long   (nullable = true)| - publisher_name:string(nullable = true)| -   group_id:long(nullable = true)| - group_name:string(nullable =   true)| - timestamp:long(nullable = true)

     

PRINT NEW SCHEMA root | - id:string(nullable = true)| -   station_id:long(nullable = true)| - station_name:string(可为空   = true)| - station_timezone:string(nullable = true)| - station_genre:string(nullable = true)| - publisher_id:long   (nullable = true)| - publisher_name:string(nullable = true)| -   group_id:long(nullable = true)| - group_name:string(nullable =   true)| - timestamp:long(nullable = true)

     

NULLABLE EVOLVED OLD root | - id:string(nullable = true)| -   station_id:long(nullable = true)| - station_name:string(可为空   = true)| - station_timezone:string(nullable = true)| - station_genre:string(nullable = true)| - publisher_id:long   (nullable = true)| - publisher_name:string(nullable = true)| -   group_id:long(nullable = true)| - group_name:string(nullable =   true)| - timestamp:long(nullable = true)

     

NULLABLE NEW root | - id:string(nullable = true)| - station_id:   long(nullable = true)| - station_name:string(nullable = true)   | - station_timezone:string(nullable = true)| - station_genre:   string(nullable = true)| - publisher_id:long(nullable = true)   | - publisher_name:string(nullable = true)| - group_id:long   (nullable = true)| - group_name:string(nullable = true)| -   timestamp:long(nullable = true)

     

2017-01-18 15:59:32错误org.apache.spark.internal.Logging $ class   执行程序:91 - 阶段2.0(TID 4)中的任务1.0中的异常   scala.MatchError:false(类java.lang.Boolean)at   org.apache.spark.sql.catalyst.CatalystTypeConverters $ $字符串转换.toCatalystImpl(CatalystTypeConverters.scala:296)     在

...

  

com.companystuff.meta.uploader.Merger $。适用(Merger.scala:49)

...

  

引起:scala.MatchError:false(类java.lang.Boolean)at   org.apache.spark.sql.catalyst.CatalystTypeConverters $ $字符串转换.toCatalystImpl(CatalystTypeConverters.scala:296)   ...

1 个答案:

答案 0 :(得分:0)

这是因为即使其架构相同,也要按实际数据排序。 因此,只需选择所有必需的列,然后进行并集查询即可。

类似这样的东西:

val columns:Seq[String]= ....
val df = oldDf.select(columns:_*).union(newDf.select(columns:_*)

希望它对您有帮助