I have been trying to create a Spark Dataset from a case class that contains an enumeration, but I can't. I am using Spark version 1.6.0. The exception complains that no encoder can be found for my enum. Is it not possible to include enums in the data of a Spark Dataset?
Code:
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MyEnum extends Enumeration {
  type MyEnum = Value
  val Hello, World = Value
}

case class MyData(field: String, other: MyEnum.Value)

object EnumTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlCtx = new SQLContext(sc)
    import sqlCtx.implicits._

    // This line throws: no encoder is found for MyEnum.Value
    val df = sc.parallelize(Array(MyData("hello", MyEnum.World))).toDS()
    println(s"df: ${df.collect().mkString(",")}")
  }
}
Answer 0 (score: 8)
You can create your own encoder:
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MyEnum extends Enumeration {
  type MyEnum = Value
  val Hello, World = Value
}

case class MyData(field: String, other: MyEnum.Value)

object MyDataEncoders {
  // Serialize MyData (enum included) with Kryo instead of the built-in
  // product encoder, which cannot handle Enumeration values.
  implicit def myDataEncoder: org.apache.spark.sql.Encoder[MyData] =
    org.apache.spark.sql.Encoders.kryo[MyData]
}

object EnumTest {
  import MyDataEncoders._

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlCtx = new SQLContext(sc)
    import sqlCtx.implicits._

    val df = sc.parallelize(Array(MyData("hello", MyEnum.World))).toDS()
    println(s"df: ${df.collect().mkString(",")}")
  }
}
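
Note that Encoders.kryo serializes each MyData value into a single binary column, so the resulting Dataset loses its named columns and you cannot filter or select on field or other. If you need a real schema, a common workaround is to store the enum as its String name in the case class and convert back with MyEnum.withName when needed. Below is a minimal sketch of that approach; MyDataStr and EnumAsStringTest are hypothetical names introduced here for illustration:

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MyEnum extends Enumeration {
  type MyEnum = Value
  val Hello, World = Value
}

// Hypothetical variant of MyData that stores the enum by name, so the
// built-in case-class encoder applies and the Dataset keeps real columns.
case class MyDataStr(field: String, other: String) {
  def otherEnum: MyEnum.Value = MyEnum.withName(other) // convert back on demand
}

object EnumAsStringTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlCtx = new SQLContext(sc)
    import sqlCtx.implicits._

    // No custom Encoder needed: String fields are handled by the
    // default case-class encoder.
    val ds = sc.parallelize(Array(MyDataStr("hello", MyEnum.World.toString))).toDS()
    println(s"ds: ${ds.collect().mkString(",")}")
  }
}

The trade-off is that the schema no longer enforces that other holds a valid enum name; MyEnum.withName throws NoSuchElementException for unknown names.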