I am currently trying to use spark-shell to test a decision tree model and I am getting a SparkException about serialization. The code and the error message are below; a description of my code is at the bottom:
scala> import org.apache.spark.SparkContext._
import org.apache.spark.SparkContext._
scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext
scala> import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.functions.lit
scala> import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.VectorAssembler
scala> import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vectors
scala> import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.feature.StandardScaler
scala> import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.DecisionTree
scala> import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.mllib.tree.model.DecisionTreeModel
scala> import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.util.MLUtils
scala> import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LabeledPoint
scala> val hiveCtx = new org.apache.spark.sql.hive.HiveContext(sc)
18/03/29 15:18:23 WARN SessionState: load mapred-default.xml, HIVE_CONF_DIR env not found!
18/03/29 15:18:23 WARN SessionState: load mapred-default.xml, HIVE_CONF_DIR env not found!
hiveCtx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@3c2b7322
scala> val mobile_features = hiveCtx.sql("SELECT velocity_arith_avg,x_velocity,total_distance,ratio_distance,record_num,std_neighbor_angle,std_total_angle,std_abs_neighbor_angle,std_abs_total_angle,total_wait_time FROM yx_loc.tmp_junwang_mobile_features")
mobile_features: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double]
scala> val walk_features = hiveCtx.sql("SELECT velocity_arith_avg,x_velocity,total_distance,ratio_distance,record_num,std_neighbor_angle,std_total_angle,std_abs_neighbor_angle,std_abs_total_angle,total_wait_time FROM yx_loc.tmp_junwang_walk_features")
walk_features: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double]
scala> val train_features = hiveCtx.sql("SELECT velocity_arith_avg,x_velocity,total_distance,ratio_distance,record_num,std_neighbor_angle,std_total_angle,std_abs_neighbor_angle,std_abs_total_angle,total_wait_time FROM yx_loc.tmp_junwang_train_features")
train_features: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double]
scala> val df_mobile = mobile_features.withColumn("label", lit(2.0))
df_mobile: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]
scala> val df_walk = walk_features.withColumn("label", lit(0.0))
df_walk: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]
scala> val df_train = train_features.withColumn("label", lit(1.0))
df_train: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]
scala> val df1 = df_mobile.unionAll(df_walk)
df1: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]
scala> val df = df1.unionAll(df_train)
df: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]
scala> val tmp_df = df.cache()
18/03/29 15:18:27 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
tmp_df: df.type = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]
scala> val assembler = new VectorAssembler().setInputCols(Array("velocity_arith_avg","x_velocity","total_distance","ratio_distance","record_num","std_neighbor_angle","std_total_angle","std_abs_neighbor_angle","std_abs_total_angle","total_wait_time")).setOutputCol("features")
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_6c704c649b5a
scala> val output = assembler.transform(tmp_df)
output: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double, features: vector]
scala> val scaler= new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(true).setWithMean(false)
scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_0853b7f7dff4
scala> val scalerModel = scaler.fit(output)
scalerModel: org.apache.spark.ml.feature.StandardScalerModel = stdScal_0853b7f7dff4
scala> val scaledData = scalerModel.transform(output)
scaledData: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double, features: vector, scaledFeatures: vector]
scala>
scala> val data_rdd = scaledData.rdd.map(row=>LabeledPoint(row.getAs[Double]("label"), row.getAs[org.apache.spark.mllib.linalg.Vector]("scaledFeatures")))
data_rdd: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[22] at map at <console>:63
scala> val numClasses = 3
numClasses: Int = 3
scala> val impurity = "entropy"
impurity: String = entropy
scala> val maxDepth = 10
maxDepth: Int = 10
scala> val minInstancedPerNode = 10
minInstancedPerNode: Int = 10
scala> val categoricalFeaturesInfo = Map[Int, Int]()
categoricalFeaturesInfo: scala.collection.immutable.Map[Int,Int] = Map()
scala> val maxBins = 32
maxBins: Int = 32
scala> val model = DecisionTree.trainClassifier(data_rdd, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins)
model: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel classifier of depth 10 with 1591 nodes
scala> val labelAndPreds = data_rdd.map { row =>
| val prediction = model.predict(row.features)
| (row.label, prediction)
| }
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2109)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:352)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:351)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:344)
at org.apache.spark.rdd.RDD.map(RDD.scala:351)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:77)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:85)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:87)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:89)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:91)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:93)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:95)
at $iwC$$iwC$$iwC.<init>(<console>:97)
at $iwC$$iwC.<init>(<console>:99)
at $iwC.<init>(<console>:101)
at <init>(<console>:103)
at .<init>(<console>:107)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1340)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:875)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:875)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:875)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:766)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:183)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:208)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:123)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: org.apache.spark.sql.CarbonEnv
Serialization stack:
- object not serializable (class: org.apache.spark.sql.CarbonEnv, value: org.apache.spark.sql.CarbonEnv@ff28a30)
- writeObject data (class: scala.collection.mutable.HashMap)
- object (class scala.collection.mutable.HashMap, Map(org.apache.spark.sql.CarbonEnv -> org.apache.spark.sql.CarbonEnv@ff28a30, org.apache.spark.sql.hbase.HBaseEnv -> org.apache.spark.sql.hbase.HBaseEnv@2933f654))
- field (class: org.apache.spark.sql.SQLContext, name: registeredEnv, type: class scala.collection.mutable.HashMap)
- object (class org.apache.spark.sql.hive.HiveContext, org.apache.spark.sql.hive.HiveContext@3c2b7322)
- field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: hiveCtx, type: class org.apache.spark.sql.hive.HiveContext)
- object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@7404ea32)
- field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
- object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@6ec5e204)
- field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
- object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@6ca9dba0)
- field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
- object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@49d75bb0)
- field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
- object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC@31c00fe7)
- field (class: $iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
- object (class $iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC@27dc80ff)
- field (class: $iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC)
- object (class $iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC@63c512c6)
- field (class: $iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC)
- object (class $iwC$$iwC$$iwC, $iwC$$iwC$$iwC@36d49430)
- field (class: $iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC)
- object (class $iwC$$iwC, $iwC$$iwC@4eb0f3ed)
- field (class: $iwC, name: $iw, type: class $iwC$$iwC)
- object (class $iwC, $iwC@5c26544b)
- field (class: $line31.$read, name: $iw, type: class $iwC)
- object (class $line31.$read, $line31.$read@7569a46b)
- field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $VAL168, type: class $line31.$read)
- object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@50038523)
- field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $outer, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
- object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@2a105ba6)
- field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1, name: $outer, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
- object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 63 more
In this code, I extract three tables from Hive and union them together with a new column called label. The resulting DataFrame has 10 feature columns and one label column. Then I use a VectorAssembler to assemble all the features into a single new column called features. After that, I apply a StandardScaler to that column, which gives a new DataFrame called scaledData with an extra column called scaledFeatures. I then use the label and the scaled features from the scaledData DataFrame to build an RDD of LabeledPoint vectors.
Finally, I feed this RDD to the decision tree trainer, and the shell output shows that the model is trained successfully. But when I try to use the same RDD and the trained model to generate predicted labels with the predict function, it fails with the SparkException about serialization shown above.
So I would like to know whether anyone can give me some advice and explain why my code fails. I am new to Spark and Scala, and after reading the related documentation I am still confused about serialization here. Thanks.
Additional information: the Spark version here is 1.5.1.
Thank you very much!
Answer 0 (score: 0):
The cause of the exception should be that the anonymous function captures an object that cannot be serialized. Before Spark runs a task, it checks whether the user-defined function (the closure) can be serialized, because it has to be shipped over the network to the executors. Here the closure row => (row.label, model.predict(row.features)) is defined in the spark-shell REPL, so it drags in the enclosing REPL line object, which also holds hiveCtx; as the serialization stack shows, that HiveContext references a non-serializable org.apache.spark.sql.CarbonEnv, and this is what makes the whole closure unserializable.
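A minimal sketch of one possible workaround (my own suggestion, not code taken from the question or this answer): use the RDD overload of DecisionTreeModel.predict so that no user-written closure references model, and the REPL wrapper object holding hiveCtx is therefore never captured.

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// Predict on an RDD of feature vectors; only the trained model itself is serialized,
// and no closure referencing the REPL line that holds hiveCtx is created.
val featureRdd: RDD[Vector] = data_rdd.map(_.features)
val predictions: RDD[Double] = model.predict(featureRdd)

// Both RDDs are derived from data_rdd by map, so they share the same partitioning
// and can be zipped element-wise into (label, prediction) pairs.
val labelAndPreds: RDD[(Double, Double)] = data_rdd.map(_.label).zip(predictions)
labelAndPreds.take(5).foreach(println)

If you prefer the per-row closure style, another commonly suggested (but version-dependent) trick is to declare the context as @transient when you create it (for example @transient val hiveCtx = ...) so that it is not pulled into serialized closures.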