I have two dataframes on which I perform a join, and sometimes I get the following error:
org.apache.spark.sql.AnalysisException: cannot resolve 'CASE WHEN (`IsAnnualReported_1` IS NOT NULL) THEN `IsAnnualReported_1` ELSE CAST(`IsAnnualReported` AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type;;
Now, to get around this, I have to manually cast every column with a mismatched data type to the matching type, like this:
when($"IsAnnualReported_1".isNotNull, $"IsAnnualReported_1").otherwise($"IsAnnualReported".cast(DataTypes.BooleanType)).as("IsAnnualReported"),
This is how I perform the join on the two dataframes:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
import org.apache.spark.{ SparkConf, SparkContext }
import java.sql.{Date, Timestamp}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{udf, input_file_name, regexp_extract, when, col, lit, concat, rank}
val get_cus_val = spark.udf.register("get_cus_val", (filePath: String) => filePath.split("\\.")(3))
val get_cus_YearPartition = spark.udf.register("get_cus_YearPartition", (filePath: String) => filePath.split("\\.")(4))
val df = sqlContext.read.format("csv").option("header", "true").option("delimiter", "|").option("inferSchema","true").load("s3://trfsmallfffile/FinancialPeriod/MAIN")
val df1With_ = df.toDF(df.columns.map(_.replace(".", "_")): _*)
val column_to_keep = df1With_.columns.filter(v => (!v.contains("^") && !v.contains("!") && !v.contains("_c"))).toSeq
val df1result = df1With_.select(column_to_keep.head, column_to_keep.tail: _*)
val df1resultFinal=df1result.withColumn("DataPartition", get_cus_val(input_file_name))
val df1resultFinalWithYear=df1resultFinal.withColumn("PartitionYear", get_cus_YearPartition(input_file_name))
val df2 = sqlContext.read.format("csv").option("header", "true").option("delimiter", "|").option("inferSchema","true").load("s3://trfsmallfffile/FinancialPeriod/INCR")
val df2With_ = df2.toDF(df2.columns.map(_.replace(".", "_")): _*)
val df2column_to_keep = df2With_.columns.filter(v => (!v.contains("^") && !v.contains("!") && !v.contains("_c"))).toSeq
val df2result = df2With_.select(df2column_to_keep.head, df2column_to_keep.tail: _*)
import org.apache.spark.sql.expressions._
val windowSpec = Window.partitionBy("FinancialPeriod_organizationId", "FinancialPeriod_periodId").orderBy($"TimeStamp".cast(LongType).desc)
val latestForEachKey = df2result.withColumn("rank", rank().over(windowSpec)).filter($"rank" === 1).drop("rank", "TimeStamp")
df1resultFinalWithYear.printSchema()
latestForEachKey.printSchema()
val dfMainOutput = df1resultFinalWithYear.join(latestForEachKey, Seq("FinancialPeriod_organizationId", "FinancialPeriod_periodId"), "outer")
.select($"FinancialPeriod_organizationId", $"FinancialPeriod_periodId",
when($"DataPartition_1".isNotNull, $"DataPartition_1").otherwise($"DataPartition".cast(DataTypes.StringType)).as("DataPartition"),
when($"PartitionYear_1".isNotNull, $"PartitionYear_1").otherwise($"PartitionYear".cast(DataTypes.StringType)).as("PartitionYear"),
when($"FinancialPeriod_periodEndDate_1".isNotNull, $"FinancialPeriod_periodEndDate_1").otherwise($"FinancialPeriod_periodEndDate").as("FinancialPeriod_periodEndDate"),
when($"FinancialPeriod_periodStartDate_1".isNotNull, $"FinancialPeriod_periodStartDate_1").otherwise($"FinancialPeriod_periodStartDate").as("FinancialPeriod_periodStartDate"),
when($"FinancialPeriod_periodDuration_1".isNotNull, $"FinancialPeriod_periodDuration_1").otherwise($"FinancialPeriod_periodDuration").as("FinancialPeriod_periodDuration"),
when($"FinancialPeriod_nonStandardPeriod_1".isNotNull, $"FinancialPeriod_nonStandardPeriod_1").otherwise($"FinancialPeriod_nonStandardPeriod").as("FinancialPeriod_nonStandardPeriod"),
when($"FinancialPeriod_periodType_1".isNotNull, $"FinancialPeriod_periodType_1").otherwise($"FinancialPeriod_periodType").as("FinancialPeriod_periodType"),
when($"PeriodFiscalYear_1".isNotNull, $"PeriodFiscalYear_1").otherwise($"PeriodFiscalYear").as("PeriodFiscalYear"),
when($"PeriodFiscalEndMonth_1".isNotNull, $"PeriodFiscalEndMonth_1").otherwise($"PeriodFiscalEndMonth").as("PeriodFiscalEndMonth"),
when($"IsAnnualReported_1".isNotNull, $"IsAnnualReported_1").otherwise($"IsAnnualReported".cast(DataTypes.BooleanType)).as("IsAnnualReported"),
when($"IsTransitional_1".isNotNull, $"IsTransitional_1").otherwise($"IsTransitional".cast(DataTypes.StringType)).as("IsTransitional"),
when($"CumulativeType_1".isNotNull, $"CumulativeType_1").otherwise($"CumulativeType").as("CumulativeType"),
when($"CalendarizedPeriodEndDate_1".isNotNull, $"CalendarizedPeriodEndDate_1").otherwise($"CalendarizedPeriodEndDate").as("CalendarizedPeriodEndDate"),
when($"EarliestAnnouncementDateTime_1".isNotNull, $"EarliestAnnouncementDateTime_1").otherwise($"EarliestAnnouncementDateTime").as("EarliestAnnouncementDateTime"),
when($"EADUTCOffset_1".isNotNull, $"EADUTCOffset_1").otherwise($"EADUTCOffset").as("EADUTCOffset"),
when($"PeriodPermId_1".isNotNull, $"PeriodPermId_1").otherwise($"PeriodPermId").as("PeriodPermId"),
when($"PeriodPermId_objectTypeId_1".isNotNull, $"PeriodPermId_objectTypeId_1").otherwise($"PeriodPermId_objectTypeId").as("PeriodPermId_objectTypeId"),
when($"PeriodPermId_objectType_1".isNotNull, $"PeriodPermId_objectType_1").otherwise($"PeriodPermId_objectType").as("PeriodPermId_objectType"),
when($"CumulativeTypeId_1".isNotNull, $"CumulativeTypeId_1").otherwise($"CumulativeTypeId").as("CumulativeTypeId"),
when($"PeriodTypeId_1".isNotNull, $"PeriodTypeId_1").otherwise($"PeriodTypeId").as("PeriodTypeId"),
when($"PeriodFiscalEndMonthId_1".isNotNull, $"PeriodFiscalEndMonthId_1").otherwise($"PeriodFiscalEndMonthId").as("PeriodFiscalEndMonthId"),
when($"PeriodLengthUnitId_1".isNotNull, $"PeriodLengthUnitId_1").otherwise($"PeriodLengthUnitId").as("PeriodLengthUnitId"),
when($"FFAction_1".isNotNull, concat(col("FFAction_1"), lit("|!|"))).otherwise(concat(col("FFAction"), lit("|!|"))).as("FFAction"))
.filter(!$"FFAction".contains("D"))
Now what I need is: how can I create the second dataframe with the schema of the first dataframe, so that I no longer get errors like this data type mismatch?
Here are the schemas of the first and second dataframes:
root
|-- FinancialPeriod_organizationId: long (nullable = true)
|-- FinancialPeriod_periodId: integer (nullable = true)
|-- FinancialPeriod_periodEndDate: timestamp (nullable = true)
|-- FinancialPeriod_periodStartDate: timestamp (nullable = true)
|-- FinancialPeriod_periodDuration: string (nullable = true)
|-- FinancialPeriod_nonStandardPeriod: string (nullable = true)
|-- FinancialPeriod_periodType: string (nullable = true)
|-- PeriodFiscalYear: integer (nullable = true)
|-- PeriodFiscalEndMonth: integer (nullable = true)
|-- IsAnnualReported: boolean (nullable = true)
|-- IsTransitional: boolean (nullable = true)
|-- CumulativeType: string (nullable = true)
|-- CalendarizedPeriodEndDate: string (nullable = true)
|-- EarliestAnnouncementDateTime: timestamp (nullable = true)
|-- EADUTCOffset: string (nullable = true)
|-- PeriodPermId: string (nullable = true)
|-- PeriodPermId_objectTypeId: string (nullable = true)
|-- PeriodPermId_objectType: string (nullable = true)
|-- CumulativeTypeId: integer (nullable = true)
|-- PeriodTypeId: integer (nullable = true)
|-- PeriodFiscalEndMonthId: integer (nullable = true)
|-- PeriodLengthUnitId: integer (nullable = true)
|-- FFAction: string (nullable = true)
|-- DataPartition: string (nullable = true)
|-- PartitionYear: string (nullable = true)
root
|-- DataPartition_1: string (nullable = true)
|-- PartitionYear_1: integer (nullable = true)
|-- FinancialPeriod_organizationId: long (nullable = true)
|-- FinancialPeriod_periodId: integer (nullable = true)
|-- FinancialPeriod_periodEndDate_1: timestamp (nullable = true)
|-- FinancialPeriod_periodStartDate_1: timestamp (nullable = true)
|-- FinancialPeriod_periodDuration_1: string (nullable = true)
|-- FinancialPeriod_nonStandardPeriod_1: string (nullable = true)
|-- FinancialPeriod_periodType_1: string (nullable = true)
|-- PeriodFiscalYear_1: string (nullable = true)
|-- PeriodFiscalEndMonth_1: string (nullable = true)
|-- IsAnnualReported_1: string (nullable = true)
|-- IsTransitional_1: string (nullable = true)
|-- CumulativeType_1: string (nullable = true)
|-- CalendarizedPeriodEndDate_1: string (nullable = true)
|-- EarliestAnnouncementDateTime_1: string (nullable = true)
|-- EADUTCOffset_1: string (nullable = true)
|-- PeriodPermId_1: string (nullable = true)
|-- PeriodPermId_objectTypeId_1: string (nullable = true)
|-- PeriodPermId_objectType_1: string (nullable = true)
|-- CumulativeTypeId_1: string (nullable = true)
|-- PeriodTypeId_1: string (nullable = true)
|-- PeriodFiscalEndMonthId_1: string (nullable = true)
|-- PeriodLengthUnitId_1: string (nullable = true)
|-- FFAction_1: string (nullable = true)
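For reference, the mismatches between the two schemas above can also be listed programmatically before deciding what to cast. This is just a small diagnostic sketch using the dataframes defined earlier; it assumes the updated columns in the second dataframe carry the _1 suffix.

// build a name -> type map from the main dataframe's schema
val mainTypes = df1resultFinalWithYear.schema.map(f => f.name -> f.dataType).toMap

// print every "_1" column whose type differs from its counterpart in the main dataframe
latestForEachKey.schema
  .filter(_.name.endsWith("_1"))
  .flatMap { f =>
    val base = f.name.stripSuffix("_1")
    mainTypes.get(base).collect { case t if t != f.dataType =>
      s"$base: main=$t, incremental=${f.dataType}"
    }
  }
  .foreach(println)
// e.g. prints: IsAnnualReported: main=BooleanType, incremental=StringType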
Answer 0 (score: 1)
You already have a working solution.
Here I will show you how to avoid writing the cast for each column manually.
Suppose you have two dataframes (as you already have):
df1
root
|-- col1: integer (nullable = false)
|-- col2: string (nullable = true)
df2
root
|-- cl2: integer (nullable = false)
|-- cl1: integer (nullable = false)
Suppose you want to change the dataTypes of df2 to those of df1. And, as you said, you know the mapping between the columns of the two dataframes. You have to create a Map of the columns:

val columnMaps = Map("col1" -> "cl1", "col2" -> "cl2")

When you have the map as above, you can compute the dataTypes to be set on each column of df2 as below:

val schema1 = df1.schema
val toBeChangedDataTypes = df1.schema.map(x => if (columnMaps.keySet.contains(x.name)) (columnMaps(x.name), x.dataType) else (x.name, x.dataType)).toList

Then you can change the dataTypes of the columns of df2 to match those of df1 by calling a recursive function:

val finalDF = castingFunction(toBeChangedDataTypes, df2)

where castingFunction is a recursive function defined as:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.functions.col
def castingFunction(typeList: List[Tuple2[String, DataType]], df: DataFrame): DataFrame = typeList match {
  case x :: y => castingFunction(y, df.withColumn(x._1, col(x._1).cast(x._2)))
  case Nil => df
}

You will see that finalDF now has cl1 as integer and cl2 as string, matching the types of df1.
You can do the same for your own dataframes.
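Applied to the dataframes in the question, the same idea could look roughly like this (a sketch, assuming the updated columns differ from the originals only by the _1 suffix): build the target type list from the first dataframe's schema and cast the _1 columns of latestForEachKey before the join.

// pair each "_1" column with the type of its counterpart in the main dataframe
val targetTypes = df1resultFinalWithYear.schema
  .map(f => (f.name + "_1", f.dataType))
  .filter { case (name, _) => latestForEachKey.columns.contains(name) }
  .toList

val latestForEachKeyCasted = castingFunction(targetTypes, latestForEachKey)
// After this, the join no longer needs the per-column casts inside the when/otherwise expressions.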
I hope this answer is helpful.