I'm running into a problem using UserDefinedType in Spark. I want to define a class and build a schema for it so that its instances can be written to a Parquet file for storage. But I hit an error, and I can't tell what in my code is causing it. The exception is thrown from DataType.scala with the message "Unsupported dataType ......"
The code is:
// Imports needed for the snippet below; ArrayData/GenericArrayData are assumed
// to live under catalyst.util here, as in Spark 1.5/1.6.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData}

@SQLUserDefinedType(udt = classOf[NodeDataType])
class Node(val dst: Int, val sim: Int) extends Serializable
//{
//  def compare(that: Node): Int = that.sim.compareTo(sim)
//  override def toString = dst.toString + "," + sim.toString
//  override def hashCode(): Int = this.toString.hashCode
//  override def equals(other: Any): Boolean = this.toString.equals(other.toString)
//}
class NodeDataType extends UserDefinedType[Node] {
  override def sqlType: DataType = StructType(Seq(
    StructField("dst", IntegerType, nullable = true),
    StructField("sim", IntegerType, nullable = true)
  ))

  override def serialize(obj: Any): GenericArrayData = {
    obj match {
      case p: Node =>
        println("serialize Node")
        val output = new Array[Any](2)
        output(0) = p.dst
        output(1) = p.sim
        new GenericArrayData(output)
    }
  }

  override def deserialize(datum: Any): Node = {
    datum match {
      case values: ArrayData =>
        println("deserialize Node")
        new Node(values.getInt(0), values.getInt(1))
    }
  }

  override def userClass: Class[Node] = classOf[Node]

  override def hashCode(): Int = 1

  override def equals(other: Any): Boolean = {
    other match {
      case that: NodeDataType => true
      case _ => false
    }
  }
}

case object NodeDataType extends NodeDataType
object Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName(this.getClass.getName)
    val sc = new SparkContext(conf)

    def buildNgbSchema(row: Row): StructType = {
      val schema = StructType(Seq(
        StructField("src", IntegerType, true),
        StructField("neighbours", NodeDataType)
      ))
      println("schema: " + schema)
      schema
    }

    val s1 = Seq(
      (1, new Node(1, 1)),
      (2, new Node(2, 2))
    )
    val rdd1 = sc.makeRDD(s1)
    val rows = rdd1.map(t => Row(t._1, t._2))
    val schema = buildNgbSchema(rows.take(1)(0))

    val sqlContext = new SQLContext(sc)
    CommonUtility.deletePath("./test") // just deletes the output path
    sqlContext.createDataFrame(rows, schema).write.parquet("./test")
  }
}
When I run this code, the following error occurs:
Caused by: java.lang.IllegalArgumentException: Unsupported dataType: {"type":"struct","fields":[{"name":"src","type":"integer","nullable":true,"metadata":{}},{"name":"neighbours","type":{"type":"udt","class":"com.tencent.ieg.tgp.recommend.NodeDataType$","pyClass":null,"sqlType":{"type":"struct","fields":[{"name":"dst","type":"integer","nullable":true,"metadata":{}},{"name":"sim","type":"integer","nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}}]}, [1.1] failure: `TimestampType' expected but `{' found
{"type":"struct","fields":[{"name":"src","type":"integer","nullable":true,"metadata":{}},{"name":"neighbours","type":{"type":"udt","class":"com.tencent.ieg.tgp.recommend.NodeDataType$","pyClass":null,"sqlType":{"type":"struct","fields":[{"name":"dst","type":"integer","nullable":true,"metadata":{}},{"name":"sim","type":"integer","nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}}]}
^
at org.apache.spark.sql.types.DataType$CaseClassStringParser$.apply(DataType.scala:245)
at org.apache.spark.sql.types.DataType$.fromCaseClassString(DataType.scala:102)
at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$$anonfun$3.apply(ParquetTypesConverter.scala:62)
at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$$anonfun$3.apply(ParquetTypesConverter.scala:62)
at scala.util.Try.getOrElse(Try.scala:77)
at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$.convertFromString(ParquetTypesConverter.scala:62)
at org.apache.spark.sql.execution.datasources.parquet.RowWriteSupport.init(ParquetTableSupport.scala:51)
at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:288)
at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:262)
at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
......
Has anyone run into this problem? Or am I doing something wrong?
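In case it helps narrow things down, this is the kind of direct round-trip check I have in mind for the UDT itself, outside of the Parquet writer. It is only a sketch that reuses the Node / NodeDataType classes above, not part of the failing job:

// Sketch only: call NodeDataType.serialize / deserialize directly,
// bypassing Parquet, to check the UDT round-trip in isolation.
val udt = new NodeDataType
val node = new Node(3, 7)
val catalystValue = udt.serialize(node)        // GenericArrayData holding (3, 7)
val restored = udt.deserialize(catalystValue)  // should rebuild a Node(3, 7)
println(restored.dst + "," + restored.sim)

If this round trip behaves on its own, that would suggest the problem is in the Parquet write path shown in the stack trace rather than in the UDT definition itself.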