如何使用Spark SQL UDT进行查询

时间:2018-11-13 12:25:19

标签: apache-spark

我为LocalDateTime编写了一个UDT(用户自定义类型),因为Spark SQL本身并不支持该类型。

>>> genexpr = (i >= 0 for i in a)
>>> list(genexpr)
[True]
>>> any(i >= 0 for i in a)
True

然后,我编写一个测试用例进行测试:

/**
 * Spark SQL user-defined type that exposes `java.time.LocalDateTime` values
 * through a column whose underlying SQL type is `TimestampType`.
 *
 * Spark's internal representation of `TimestampType` is a `Long` holding
 * microseconds since the Unix epoch, so `serialize` and `deserialize`
 * convert to and from that unit symmetrically. The local date-time is
 * interpreted in the JVM's default time zone in both directions.
 */
class LocalDateTimeUDT extends UserDefinedType[LocalDateTime] {

  override def sqlType: DataType = TimestampType

  /**
   * Converts a local date-time to microseconds since the Unix epoch.
   *
   * @param obj the value to store, interpreted in the system default zone
   * @return a `Long` microsecond count (sub-microsecond precision is truncated)
   * @throws ArithmeticException if the instant does not fit in a `Long` of microseconds
   */
  override def serialize(obj: LocalDateTime): Any = {
    val instant = obj.atZone(ZoneId.systemDefault()).toInstant
    // Exact arithmetic so an out-of-range value fails loudly instead of wrapping.
    Math.addExact(
      Math.multiplyExact(instant.getEpochSecond, 1000000L),
      (instant.getNano / 1000).toLong
    )
  }

  /**
   * Rebuilds the local date-time from the stored microsecond count.
   *
   * @param datum the value produced by `serialize` (a `Long` of epoch microseconds)
   * @return the corresponding local date-time in the system default zone
   * @throws IllegalArgumentException if `datum` is not a `Long`
   */
  override def deserialize(datum: Any): LocalDateTime = datum match {
    case micros: Long =>
      // floorDiv/floorMod keep the nano adjustment non-negative for pre-1970 values.
      val instant = java.time.Instant.ofEpochSecond(
        Math.floorDiv(micros, 1000000L),
        Math.floorMod(micros, 1000000L) * 1000L
      )
      LocalDateTime.ofInstant(instant, ZoneId.systemDefault())
    case other =>
      val typeName = Option(other).map(_.getClass.getName).getOrElse("null")
      throw new IllegalArgumentException(
        s"LocalDateTimeUDT cannot deserialize a value of type $typeName"
      )
  }

  override def userClass: Class[LocalDateTime] = classOf[LocalDateTime]
}

}

但是,运行时抛出了以下异常:

 // Builds a one-column DataFrame of LocalDateTime values backed by
 // LocalDateTimeUDT, then runs a SQL filter comparing that column with
 // current_timestamp().
 test("SparkSQLTest") {
    val spark = SparkSession.builder().master("local").appName("SparkTest").getOrCreate()
    import spark.implicits._
    // Associate LocalDateTime with the UDT by class name.
    UDTRegistration.register(classOf[LocalDateTime].getName, classOf[LocalDateTimeUDT].getName)
    val seq = Seq(LocalDateTime.now(), LocalDateTime.now())
    // Wrap each value in a single-field Row to match the schema below.
    val rdd = spark.sparkContext.parallelize(seq).map(d => Row.fromSeq(Seq(d)))
    val schema = new StructType().add("udt", new LocalDateTimeUDT())
    val df = spark.createDataFrame(rdd, schema)
    df.printSchema()
    df.show(truncate = false)

    // Expose the DataFrame to SQL under the name "t".
    df.createOrReplaceTempView("t")
    // The query below is the one reported to fail, with this error:
    // cannot resolve '(t.`udt` > current_timestamp())' due to data type mismatch:
    // differing types in '(t.`udt` > current_timestamp())' (timestamp and timestamp).; line 1 pos 22;
    spark.sql("select * from t where udt > current_timestamp()").show(truncate = false)

我想知道如何才能让这条带有过滤条件的SQL查询正常工作,谢谢。

0 个答案:

没有答案