Using Spark UserDefinedType

Date: 2016-07-19 08:59:37

Tags: scala apache-spark user-defined-types

I'm running into a problem using UserDefinedType in Spark. I want to define a class, build a schema for it, and write it to a Parquet file for storage. However, I hit an error and cannot tell what in my code causes it. An exception is thrown from DataType.scala with the message "Unsupported dataType ......".

The code is:

// Imports were omitted from the original post; these assume Spark 1.5.x,
// where ArrayData / GenericArrayData are exposed via org.apache.spark.sql.types.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types._

@SQLUserDefinedType(udt = classOf[NodeDataType])
class Node(val dst: Int, val sim: Int) extends Serializable
//{
//  def compare(that: Node): Int = that.sim.compareTo(sim)
//  override def toString = dst.toString + "," + sim.toString
//  override def hashCode(): Int = this.toString.hashCode
//  override def equals(other: Any): Boolean = this.toString.equals(other.toString)
//}
class NodeDataType extends UserDefinedType[Node]{
  override def sqlType: DataType = StructType(Seq(
    StructField("dst", IntegerType, nullable = true),
    StructField("sim", IntegerType, nullable = true)
  ))
  override def serialize(obj: Any): GenericArrayData = {
    obj match{
      case p: Node =>
        println("serialize Node")
        val output = new Array[Any](2)
        output(0) = p.dst
        output(1) = p.sim
        new GenericArrayData(output)
    }
  }
  override def deserialize(datum: Any): Node = {
    datum match{
      case values: ArrayData =>
        println("deserialize Node")
        new Node(values.getInt(0), values.getInt(1))
    }
  }
  override def userClass: Class[Node] = classOf[Node]
  override def hashCode(): Int = 1
  override def equals(other: Any): Boolean = {
    other match{
      case that: NodeDataType => true
      case _ => false
    }
  }
}
case object NodeDataType extends NodeDataType

object Test{
  def main(args: Array[String]): Unit ={
    val conf = new SparkConf().setAppName(this.getClass.getName)
    val sc = new SparkContext(conf)

    def buildNgbSchema(row: Row): StructType = {
      val schema = StructType(Seq(
        StructField("src", IntegerType, true),
        StructField("neighbours", NodeDataType)
      ))
      println("schema: " + schema)
      schema
    }

    val s1 = Seq(
      (1, new Node(1, 1)),
      (2, new Node(2, 2))
    )
    val rdd1 = sc.makeRDD(s1)
    val rows = rdd1.map(t => Row(t._1, t._2))
    val schema = buildNgbSchema(rows.take(1)(0))
    val sqlContext = new SQLContext(sc)
    CommonUtility.deletePath("./test")        //just delete the path
    sqlContext.createDataFrame(rows, schema).write.parquet("./test")
  }
}
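As a side note, the serialize / deserialize pair of NodeDataType can be exercised on its own, without Parquet; a minimal sketch (reusing the classes above, not part of the failing job):

val nodeUdt = new NodeDataType              // same UDT as defined above
val arr = nodeUdt.serialize(new Node(1, 1)) // GenericArrayData holding (dst, sim)
val back = nodeUdt.deserialize(arr)         // rebuilds a Node from the ArrayData
println(back.dst + "," + back.sim)          // expected to print 1,1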

When I run the full program, however, the following error occurs at write time:

Caused by: java.lang.IllegalArgumentException: Unsupported dataType: {"type":"struct","fields":[{"name":"src","type":"integer","nullable":true,"metadata":{}},{"name":"neighbours","type":{"type":"udt","class":"com.tencent.ieg.tgp.recommend.NodeDataType$","pyClass":null,"sqlType":{"type":"struct","fields":[{"name":"dst","type":"integer","nullable":true,"metadata":{}},{"name":"sim","type":"integer","nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}}]}, [1.1] failure: `TimestampType' expected but `{' found

{"type":"struct","fields":[{"name":"src","type":"integer","nullable":true,"metadata":{}},{"name":"neighbours","type":{"type":"udt","class":"com.tencent.ieg.tgp.recommend.NodeDataType$","pyClass":null,"sqlType":{"type":"struct","fields":[{"name":"dst","type":"integer","nullable":true,"metadata":{}},{"name":"sim","type":"integer","nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}}]}
^
    at org.apache.spark.sql.types.DataType$CaseClassStringParser$.apply(DataType.scala:245)
    at org.apache.spark.sql.types.DataType$.fromCaseClassString(DataType.scala:102)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$$anonfun$3.apply(ParquetTypesConverter.scala:62)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$$anonfun$3.apply(ParquetTypesConverter.scala:62)
    at scala.util.Try.getOrElse(Try.scala:77)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$.convertFromString(ParquetTypesConverter.scala:62)
    at org.apache.spark.sql.execution.datasources.parquet.RowWriteSupport.init(ParquetTableSupport.scala:51)
    at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:288)
    at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:262)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272)
    at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:88)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
......

Has anyone run into this problem? Or am I doing something wrong?

0 Answers:

No answers yet.