Spark 1.6

Asked: 2016-01-11 07:10:39

Tags: apache-spark apache-spark-sql spark-dataframe apache-spark-dataset

I am evaluating replacing existing RDD code with the Dataset API. For one of my use cases, I cannot map a Dataset to another case class.

Here is what I am trying to do...

case class MyMap(map: Map[String, String])

case class V1(a: String, b: String){
  def toMyMap: MyMap = {
    MyMap(Map(a->b))
  }

  def toStr: String = {
    a
  }
}

object MyApp extends App {
  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.sql.SQLContext

  // Get a handle to the sqlContext and other useful stuff here.
  val sc = new SparkContext(new SparkConf().setAppName("MyApp").setMaster("local[*]"))
  val sqlContext = new SQLContext(sc)
  import sqlContext.implicits._ // provides the case-class encoders used below

  val df1 = sqlContext.createDataset(Seq(V1("2015-05-01", "data1"), V1("2015-05-01", "data2"))).toDF()
  df1.as[V1].map(_.toMyMap).show() // Errors out. Added the exception below.
  df1.as[V1].map(_.toStr).show()   // Works fine.
}

Any help would be greatly appreciated.

The following exception is thrown:


Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task not serializable: java.io.NotSerializableException: scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anon$1
Serialization stack:
- object not serializable (class: scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anon$1, value: package lang)
- field (class: scala.reflect.internal.Types$ThisType, name: sym, type: class scala.reflect.internal.Symbols$Symbol)
- object (class scala.reflect.internal.Types$UniqueThisType, java.lang.type)
- field (class: scala.reflect.internal.Types$TypeRef, name: pre, type: class scala.reflect.internal.Types$Type)
- object (class scala.reflect.internal.Types$ClassNoArgsTypeRef, String)
- field (class: scala.reflect.internal.Types$TypeRef, name: normalized, type: class scala.reflect.internal.Types$Type)
- object (class scala.reflect.internal.Types$AliasNoArgsTypeRef, String)
- field (class: org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$6, name: keyType$1, type: class scala.reflect.api.Types$TypeApi)
- object (class org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$6, <function1>)
- field (class: org.apache.spark.sql.catalyst.expressions.MapObjects, name: function, type: interface scala.Function1)
- object (class org.apache.spark.sql.catalyst.expressions.MapObjects, mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), keyArray, ArrayType(StringType,true)), StringType))
- field (class: org.apache.spark.sql.catalyst.expressions.Invoke, name: targetObject, type: class org.apache.spark.sql.catalyst.expressions.Expression)
- object (class org.apache.spark.sql.catalyst.expressions.Invoke, invoke(mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), keyArray, ArrayType(StringType,true)), StringType), array, ObjectType(class [Ljava.lang.Object;)))
- writeObject data (class: scala.collection.immutable.List$SerializationProxy)
- object (class scala.collection.immutable.List$SerializationProxy, scala.collection.immutable.List$SerializationProxy@7e78c3cf)
- writeReplace data (class: scala.collection.immutable.List$SerializationProxy)
- object (class scala.collection.immutable.$colon$colon, List(invoke(mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), keyArray, ArrayType(StringType,true)), StringType), array, ObjectType(class [Ljava.lang.Object;)), invoke(mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), valueArray, ArrayType(StringType,true)), StringType), array, ObjectType(class [Ljava.lang.Object;))))
- field (class: org.apache.spark.sql.catalyst.expressions.StaticInvoke, name: arguments, type: interface scala.collection.Seq)
- object (class org.apache.spark.sql.catalyst.expressions.StaticInvoke, staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$, ObjectType(interface scala.collection.Map), toScalaMap, invoke(mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), keyArray, ArrayType(StringType,true)), StringType), array, ObjectType(class [Ljava.lang.Object;)), invoke(mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), valueArray, ArrayType(StringType,true)), StringType), array, ObjectType(class [Ljava.lang.Object;)), true))
- writeObject data (class: scala.collection.immutable.List$SerializationProxy)
- object (class scala.collection.immutable.List$SerializationProxy, scala.collection.immutable.List$SerializationProxy@377795c5)
- writeReplace data (class: scala.collection.immutable.List$SerializationProxy)
- object (class scala.collection.immutable.$colon$colon, List(staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$, ObjectType(interface scala.collection.Map), toScalaMap, invoke(mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), keyArray, ArrayType(StringType,true)), StringType), array, ObjectType(class [Ljava.lang.Object;)), invoke(mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), valueArray, ArrayType(StringType,true)), StringType), array, ObjectType(class [Ljava.lang.Object;)), true)))
- field (class: org.apache.spark.sql.catalyst.expressions.NewInstance, name: arguments, type: interface scala.collection.Seq)
- object (class org.apache.spark.sql.catalyst.expressions.NewInstance, newinstance(class collector.MyMap, staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$, ObjectType(interface scala.collection.Map), toScalaMap, invoke(mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), keyArray, ArrayType(StringType,true)), StringType), array, ObjectType(class [Ljava.lang.Object;)), invoke(mapobjects(<function1>, invoke(upcast('map, MapType(StringType,StringType,true), - field (class: "scala.collection.immutable.Map", name: "map"), - root class: "collector.MyMap"), valueArray, ArrayType(StringType,true)), StringType), array, ObjectType(class [Ljava.lang.Object;)), true), false, ObjectType(class collector.MyMap), None))
- field (class: org.apache.spark.sql.catalyst.encoders.ExpressionEncoder, name: fromRowExpression, type: class org.apache.spark.sql.catalyst.expressions.Expression)
- object (class org.apache.spark.sql.catalyst.encoders.ExpressionEncoder, class[map#ExprId(9,255a02aa-f2fa-482d-8cd1-63e2d4d08b30): map])
- field (class: org.apache.spark.sql.execution.MapPartitions, name: uEncoder, type: class org.apache.spark.sql.catalyst.encoders.ExpressionEncoder)
- object (class org.apache.spark.sql.execution.MapPartitions, !MapPartitions <function1>, class[a[0]: string, b[0]: string], class[map#ExprId(9,255a02aa-f2fa-482d-8cd1-63e2d4d08b30): map], [map#13]
+- LocalTableScan [a#2,b#3], [[0,180000000a,2800000005,2d35302d35313032,3130,3161746164],[0,180000000a,2800000005,2d35302d35313032,3130,3261746164]]
)
- field (class: org.apache.spark.sql.execution.MapPartitions$$anonfun$8, name: $outer, type: class org.apache.spark.sql.execution.MapPartitions)
- object (class org.apache.spark.sql.execution.MapPartitions$$anonfun$8, <function1>)
- field (class: org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1, name: f$22, type: interface scala.Function1)
- object (class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1, <function0>)
- field (class: org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21, name: $outer, type: class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1)
- object (class org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21, <function3>)
- field (class: org.apache.spark.rdd.MapPartitionsRDD, name: f, type: interface scala.Function3)
- object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[1] at show at CollectorSparkTest.scala:50)
- field (class: org.apache.spark.NarrowDependency, name: rdd, type: class org.apache.spark.rdd.RDD)
- object (class org.apache.spark.OneToOneDependency, org.apache.spark.OneToOneDependency@110f15b7)
- writeObject data (class: scala.collection.immutable.List$SerializationProxy)
- object (class scala.collection.immutable.List$SerializationProxy, scala.collection.immutable.List$SerializationProxy@6bb23696)
- writeReplace data (class: scala.collection.immutable.List$SerializationProxy)
- object (class scala.collection.immutable.$colon$colon, List(org.apache.spark.OneToOneDependency@110f15b7))
- field (class: org.apache.spark.rdd.RDD, name: org$apache$spark$rdd$RDD$$dependencies_, type: interface scala.collection.Seq)
- object (class org.apache.spark.rdd.MapPartitionsRDD, MapPartitionsRDD[2] at show at CollectorSparkTest.scala:50)
- field (class: scala.Tuple2, name: _1, type: class java.lang.Object)
- object (class scala.Tuple2, (MapPartitionsRDD[2] at show at CollectorSparkTest.scala:50, <function2>))
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler.submitMissingTasks(DAGScheduler.scala:1010)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:921)
at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:861)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1607)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:212)
at org.apache.spark.sql.execution.Limit.executeCollect(basicOperators.scala:165)
at org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:174)
at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1538)
at org.apache.spark.sql.DataFrame$$anonfun$org$apache$spark$sql$DataFrame$$execute$1$1.apply(DataFrame.scala:1538)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2125)
at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$execute$1(DataFrame.scala:1537)
at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$collect(DataFrame.scala:1544)
at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1414)
at org.apache.spark.sql.DataFrame$$anonfun$head$1.apply(DataFrame.scala:1413)
at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2138)
at org.apache.spark.sql.DataFrame.head(DataFrame.scala:1413)
at org.apache.spark.sql.DataFrame.take(DataFrame.scala:1495)
at org.apache.spark.sql.DataFrame.showString(DataFrame.scala:171)
at org.apache.spark.sql.DataFrame.show(DataFrame.scala:394)
at org.apache.spark.sql.Dataset.show(Dataset.scala:228)
at org.apache.spark.sql.Dataset.show(Dataset.scala:192)
at org.apache.spark.sql.Dataset.show(Dataset.scala:200)

2 Answers:

Answer 0 (score: 2)

I think you may actually be hitting SPARK-12696, which has been fixed in spark/master. I am hoping to release 1.6.1 in the near future, which should include this patch.
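Until that release is out, one possible workaround (a sketch of my own, not part of this answer) is to perform the problematic step on the underlying RDD, which uses plain Java serialization instead of a Catalyst encoder:

// Hedged sketch: avoid the Map encoder entirely by dropping to the RDD API.
// Assumes the V1 and MyMap case classes from the question are in scope.
val myMaps = df1.as[V1].rdd.map(_.toMyMap) // RDD[MyMap]; no encoder required
myMaps.take(10).foreach(println)

This gives up the Dataset API's optimizations for that step, but leaves the rest of the pipeline unchanged.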

Answer 1 (score: 1)

The problem is that the Scala Map class is not serializable here, so the Dataset API cannot automatically generate an appropriate encoder. I would suggest converting the map to a string, then parsing the string and converting it back into a map (assuming you are storing strings in the map). A rough illustration follows.
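This is a sketch only: it assumes keys and values never contain the chosen delimiter characters, and the MyMapStr wrapper and both helper functions are hypothetical names of mine, not from this answer.

// Wrap the flattened map in a single-String case class, which Spark 1.6 can encode.
case class MyMapStr(repr: String)

// Flatten a map to "k1=v1;k2=v2" form; the delimiters are an assumption.
def toRepr(m: Map[String, String]): String =
  m.map { case (k, v) => s"$k=$v" }.mkString(";")

// Parse the flattened form back into a map.
def fromRepr(s: String): Map[String, String] =
  if (s.isEmpty) Map.empty[String, String]
  else s.split(";").map { kv =>
    val Array(k, v) = kv.split("=", 2); k -> v
  }.toMap

val ds = df1.as[V1].map(v => MyMapStr(toRepr(v.toMyMap.map))) // now encodable
ds.show()
// Restore the maps on the driver, where no encoder is involved:
val restored = ds.collect().map(x => fromRepr(x.repr))

Note that parsing back inside another map on the Dataset would reintroduce the Map encoder, so the round trip back to Map[String, String] has to happen on the driver or via the RDD API.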

The Dataset API may also not be the best choice here. I wrote this article, which might be of interest.