I am evaluating replacing existing RDD code with Datasets. For one of my use cases, I am unable to map the Dataset to another case class.
Here is what I am trying to do...
case class MyMap(map: Map[String, String])

case class V1(a: String, b: String) {
  def toMyMap: MyMap = {
    MyMap(Map(a -> b))
  }

  def toStr: String = {
    a
  }
}

object MyApp extends App {
  // Get handle to sqlContext and other useful stuff here.
  val df1 = sqlContext.createDataset(Seq(V1("2015-05-01", "data1"), V1("2015-05-01", "data2"))).toDF()
  df1.as[V1].map(_.toMyMap).show() // Errors out. Added the exception below.
  df1.as[V1].map(_.toStr).show()   // Works fine.
}
Any help would be greatly appreciated.

I get the following exception:
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task not serializable: java.io.NotSerializableException: scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anon$1
Serialization stack:
    - object not serializable (class: scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anon$1, value: package lang)
    - field (class: scala.reflect.internal.Types$ThisType, name: sym, type: class scala.reflect.internal.Symbols$Symbol)
    - object (class scala.reflect.internal.Types$UniqueThisType, java.lang.type)
    - field (class: scala.reflect.internal.Types$TypeRef, name: pre, type: class scala.reflect.internal.Types$Type)
    - object (class scala.reflect.internal.Types$ClassNoArgsTypeRef, String)
    - field (class: scala.reflect.internal.Types$TypeRef, name: normalized, type: class scala.reflect.internal.Types$Type)
    - object (class scala.reflect.internal.Types$AliasNoArgsTypeRef, String)
    - field (class: org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$6, name: keyType$1, type: class scala.reflect.api.Types$TypeApi)
    - object (class org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$6, )
    - field (class: org.apache.spark.sql.catalyst.expressions.MapObjects, name: function, type: interface scala.Function1)
    - object (class org.apache.spark.sql.catalyst.expressions.MapObjects, mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),keyArray,ArrayType(StringType,true)),StringType))
    - field (class: org.apache.spark.sql.catalyst.expressions.Invoke, name: targetObject, type: class org.apache.spark.sql.catalyst.expressions.Expression)
    - object (class org.apache.spark.sql.catalyst.expressions.Invoke, invoke(mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),keyArray,ArrayType(StringType,true)),StringType),array,ObjectType(class [Ljava.lang.Object;)))
    - writeObject data (class: scala.collection.immutable.List$SerializationProxy)
    - object (class scala.collection.immutable.List$SerializationProxy, scala.collection.immutable.List$SerializationProxy@7e78c3cf)
    - writeReplace data (class: scala.collection.immutable.List$SerializationProxy)
    - object (class scala.collection.immutable.$colon$colon, List(invoke(mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),keyArray,ArrayType(StringType,true)),StringType),array,ObjectType(class [Ljava.lang.Object;)), invoke(mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),valueArray,ArrayType(StringType,true)),StringType),array,ObjectType(class [Ljava.lang.Object;))))
    - field (class: org.apache.spark.sql.catalyst.expressions.StaticInvoke, name: arguments, type: interface scala.collection.Seq)
    - object (class org.apache.spark.sql.catalyst.expressions.StaticInvoke, staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$,ObjectType(interface scala.collection.Map),toScalaMap,invoke(mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),keyArray,ArrayType(StringType,true)),StringType),array,ObjectType(class [Ljava.lang.Object;)),invoke(mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),valueArray,ArrayType(StringType,true)),StringType),array,ObjectType(class [Ljava.lang.Object;)),true))
    - writeObject data (class: scala.collection.immutable.List$SerializationProxy)
    - object (class scala.collection.immutable.List$SerializationProxy, scala.collection.immutable.List$SerializationProxy@377795c5)
    - writeReplace data (class: scala.collection.immutable.List$SerializationProxy)
    - object (class scala.collection.immutable.$colon$colon, List(staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$,ObjectType(interface scala.collection.Map),toScalaMap,invoke(mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),keyArray,ArrayType(StringType,true)),StringType),array,ObjectType(class [Ljava.lang.Object;)),invoke(mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),valueArray,ArrayType(StringType,true)),StringType),array,ObjectType(class [Ljava.lang.Object;)),true)))
    - field (class: org.apache.spark.sql.catalyst.expressions.NewInstance, name: arguments, type: interface scala.collection.Seq)
    - object (class org.apache.spark.sql.catalyst.expressions.NewInstance, newinstance(class collector.MyMap,staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$,ObjectType(interface scala.collection.Map),toScalaMap,invoke(mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),keyArray,ArrayType(StringType,true)),StringType),array,ObjectType(class [Ljava.lang.Object;)),invoke(mapobjects(,invoke(upcast('map,MapType(StringType,StringType,true),- field (class: "scala.collection.immutable.Map", name: "map"),- root class: "collector.MyMap"),valueArray,ArrayType(StringType,true)),StringType),array,ObjectType(class [Ljava.lang.Object;)),true),false,ObjectType(class collector.MyMap),None))
    - field (class: org.apache.spark.sql.catalyst.encoders.ExpressionEncoder, name: fromRowExpression, type: class org.apache.spark.sql.catalyst.expressions.Expression)
    - object (class org.apache.spark.sql.catalyst.encoders.ExpressionEncoder, class[map#ExprId(9,255a02aa-f2fa-482d-8cd1-63e2d4d08b30): map])
    - field (class: org.apache.spark.sql.execution.MapPartitions, name: uEncoder, type: class org.apache.spark.sql.catalyst.encoders.ExpressionEncoder)
    - object (class org.apache.spark.sql.execution.MapPartitions, !MapPartitions , class[a[0]: string, b[0]: string], class[map#ExprId(9,255a02aa-f2fa-482d-8cd1-63e2d4d08b30): map], [map#13]
+- LocalTableScan [a#2,b#3], [[0,180000000a,2800000005,2d35302d35313032,3130,3161746164],[0,180000000a,2800000005,2d35302d35313032,3130,3261746164]]
)
    - field (class: org.apache.spark.sql.execution.MapPartitions$$anonfun$8, name: $outer, type: class org.apache.spark.sql.execution.MapPartitions)
    - object (class org.apache.spark.sql.execution.MapPartitions$$anonfun$8, )
    - field (class: org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1, name: f$22, type: interface scala.Function1)
    - object (class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1, )
    - field (class: org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21, name: $outer, type: class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1)
    - object (class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21, )
    - field (class: org.apache.spark.rdd.MapPartitionsRDD, name: f, type: interface scala.Function3)
    - object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[1] at show at CollectorSparkTest.scala:50)
    - field (class: org.apache.spark.NarrowDependency, name: rdd, type: class org.apache.spark.rdd.RDD)
    - object (class org.apache.spark.OneToOneDependency, org.apache.spark.OneToOneDependency@110f15b7)
    - writeObject data (class: scala.collection.immutable.List$SerializationProxy)
    - object (class scala.collection.immutable.List$SerializationProxy, scala.collection.immutable.List$SerializationProxy@6bb23696)
    - writeReplace data (class: scala.collection.immutable.List$SerializationProxy)
    - object (class scala.collection.immutable.$colon$colon, List(org.apache.spark.OneToOneDependency@110f15b7))
    - field (class: org.apache.spark.rdd.RDD, name: org$apache$spark$rdd$RDD$$dependencies_, type: interface scala.collection.Seq)
    - object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[2] at show at CollectorSparkTest.scala:50)
    - field (class: scala.Tuple2, name: _1, type: class java.lang.Object)
    - object (class scala.Tuple2, (MapPartitionsRDD[2] at show at CollectorSparkTest.scala:50,))
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler.submitMissingTasks(DAGScheduler.scala:1010)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:921)
at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:861)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1607)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:212)
at org.apache.spark.sql.execution.Limit.executeCollect(basicOperators.scala:165)
at org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:174)
at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1538)
at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1538)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2125)
at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$execute$1(DataFrame.scala:1537)
at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$collect(DataFrame.scala:1544)
at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1414)
at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1413)
at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2138)
at org.apache.spark.sql.DataFrame.head(DataFrame.scala:1413)
at org.apache.spark.sql.DataFrame.take(DataFrame.scala:1495)
at org.apache.spark.sql.DataFrame.showString(DataFrame.scala:171)
at org.apache.spark.sql.DataFrame.show(DataFrame.scala:394)
at org.apache.spark.sql.Dataset.show(Dataset.scala:228)
at org.apache.spark.sql.Dataset.show(Dataset.scala:192)
at org.apache.spark.sql.Dataset.show(Dataset.scala:200)
Answer 0 (score: 2)
I think you may actually be hitting SPARK-12696, which has been fixed in spark/master. I hope 1.6.1 will be released in the near future, and it should include this patch.
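In the meantime, one possible stop-gap (not part of the original answer, and assuming Spark 1.6.0's Encoders.kryo API is available) is to supply an explicit Kryo-backed encoder for MyMap so the reflection-based encoder derivation that fails above is bypassed. A minimal sketch:

import org.apache.spark.sql.{Encoder, Encoders}

// Fall back to a Kryo-serialized encoder for MyMap instead of the
// reflection-derived one that trips over the Map[String, String] field.
implicit val myMapEncoder: Encoder[MyMap] = Encoders.kryo[MyMap]

// With the implicit encoder in scope, the failing call from the question
// should pick it up. The encoded column is a single binary blob, so the
// resulting Dataset is opaque to Spark SQL, but the map() itself can run.
df1.as[V1].map(_.toMyMap).show()

The trade-off is that a Kryo-encoded Dataset loses its columnar schema, so this is only worthwhile until a release containing the fix is available.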
Answer 1 (score: 1)
The problem is that the Scala Map class is not serializable, so the Dataset API cannot automatically generate an appropriate encoder. I would suggest converting the map to a string, then parsing the string and converting it back to a map (assuming you are storing strings in the map).
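For illustration, a minimal sketch of that string round-trip (the '=' and ';' delimiters and the helper names are hypothetical, not from the original answer, and assume keys and values never contain them):

// Encode the map as a single delimited string so the Dataset only has to
// handle String columns, then parse it back after the map() step.
def mapToStr(m: Map[String, String]): String =
  m.map { case (k, v) => s"$k=$v" }.mkString(";")

def strToMap(s: String): Map[String, String] =
  if (s.isEmpty) Map.empty
  else s.split(";").map { kv =>
    val Array(k, v) = kv.split("=", 2)
    k -> v
  }.toMap

// Usage against the question's example: keep the intermediate Dataset typed
// as String, and rebuild the Map only in plain Scala code on the driver.
val asStrings = df1.as[V1].map(v => mapToStr(Map(v.a -> v.b)))
val maps = asStrings.collect().map(strToMap)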
The Dataset API may also not be the best option here. I wrote this article, which may be of interest.