SparkException: Task not serializable when using StandardScaler

Date: 2018-03-29 01:47:07

Tags: scala apache-spark

I am new to Spark and Scala and was testing my code in spark-shell. However, I ran into a problem about serializability (a description of my code is at the bottom):

scala> import org.apache.spark.SparkContext._
import org.apache.spark.SparkContext._

scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext

scala> val hiveCtx = new org.apache.spark.sql.hive.HiveContext(sc)
18/03/29 09:16:06 WARN SessionState: load mapred-default.xml, HIVE_CONF_DIR env not found!
18/03/29 09:16:07 WARN SessionState: load mapred-default.xml, HIVE_CONF_DIR env not found!
hiveCtx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@2e5affb3

scala> val mobile_features = hiveCtx.sql("SELECT velocity_arith_avg,x_velocity,total_distance,ratio_distance,record_num,std_neighbor_angle,std_total_angle,std_abs_neighbor_angle,std_abs_total_angle,total_wait_time FROM yx_loc.tmp_junwang_mobile_features")
mobile_features: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double]

scala> val walk_features = hiveCtx.sql("SELECT velocity_arith_avg,x_velocity,total_distance,ratio_distance,record_num,std_neighbor_angle,std_total_angle,std_abs_neighbor_angle,std_abs_total_angle,total_wait_time FROM yx_loc.tmp_junwang_walk_features")
walk_features: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double]

scala> val train_features = hiveCtx.sql("SELECT velocity_arith_avg,x_velocity,total_distance,ratio_distance,record_num,std_neighbor_angle,std_total_angle,std_abs_neighbor_angle,std_abs_total_angle,total_wait_time FROM yx_loc.tmp_junwang_train_features")
train_features: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double]

scala> import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.functions.lit

scala> val df_mobile = mobile_features.withColumn("label", lit(2.0))
df_mobile: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]

scala> val df_walk = walk_features.withColumn("label", lit(0.0))
df_walk: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]

scala> val df_train = train_features.withColumn("label", lit(1.0))
df_train: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]

scala> val df1 = df_mobile.unionAll(df_walk)
df1: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]

scala> val df = df1.unionAll(df_train)
df: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]

scala> val tmp_df = df.cache()
18/03/29 09:16:22 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
tmp_df: df.type = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double]

scala> import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.VectorAssembler

scala> import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vectors

scala> val assembler = new VectorAssembler().setInputCols(Array("velocity_arith_avg","x_velocity","total_distance","ratio_distance","record_num","std_neighbor_angle","std_total_angle","std_abs_neighbor_angle","std_abs_total_angle","total_wait_time")).setOutputCol("features")
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_1f84caf86e52

scala> val output = assembler.transform(tmp_df)
output: org.apache.spark.sql.DataFrame = [velocity_arith_avg: double, x_velocity: double, total_distance: double, ratio_distance: double, record_num: bigint, std_neighbor_angle: double, std_total_angle: double, std_abs_neighbor_angle: double, std_abs_total_angle: double, total_wait_time: double, label: double, features: vector]

scala> import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LabeledPoint

scala> val data_rdd = output.rdd.map(row=>LabeledPoint(row.getAs[Double]("label"), row.getAs[org.apache.spark.mllib.linalg.Vector]("features")))
data_rdd: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[19] at map at <console>:55

scala> import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.util.MLUtils

scala> import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.feature.StandardScaler

scala> val scaler= new StandardScaler(withMean=true, withStd=true).fit(data_rdd.map(x=>x.features))
scaler: org.apache.spark.mllib.feature.StandardScalerModel = org.apache.spark.mllib.feature.StandardScalerModel@10eb9604

scala> val data_scaled =data_rdd.map(x=>(x.label, scaler.transform(Vectors.dense(x.features.toArray))))
org.apache.spark.SparkException: Task not serializable
        at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
        at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
        at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
        at org.apache.spark.SparkContext.clean(SparkContext.scala:2109)
        at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:352)
        at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:351)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:344)
        at org.apache.spark.rdd.RDD.map(RDD.scala:351)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:61)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:66)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:68)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:70)
        at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:72)
        at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:74)
        at $iwC$$iwC$$iwC$$iwC.<init>(<console>:76)
        at $iwC$$iwC$$iwC.<init>(<console>:78)
        at $iwC$$iwC.<init>(<console>:80)
        at $iwC.<init>(<console>:82)
        at <init>(<console>:84)
        at .<init>(<console>:88)
        at .<clinit>(<console>)
        at .<init>(<console>:7)
        at .<clinit>(<console>)
        at $print(<console>)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
        at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1340)
        at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
        at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
        at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
        at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
        at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
        at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
        at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
        at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
        at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
        at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
        at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
        at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
        at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
        at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
        at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
        at org.apache.spark.repl.Main$.main(Main.scala:31)
        at org.apache.spark.repl.Main.main(Main.scala)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:766)
        at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:183)
        at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:208)
        at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:123)
        at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: org.apache.spark.sql.CarbonEnv
Serialization stack:
        - object not serializable (class: org.apache.spark.sql.CarbonEnv, value: org.apache.spark.sql.CarbonEnv@98cc7c3)
        - writeObject data (class: scala.collection.mutable.HashMap)
        - object (class scala.collection.mutable.HashMap, Map(org.apache.spark.sql.CarbonEnv -> org.apache.spark.sql.CarbonEnv@98cc7c3, org.apache.spark.sql.hbase.HBaseEnv -> org.apache.spark.sql.hbase.HBaseEnv@7fd5109))
        - field (class: org.apache.spark.sql.SQLContext, name: registeredEnv, type: class scala.collection.mutable.HashMap)
        - object (class org.apache.spark.sql.hive.HiveContext, org.apache.spark.sql.hive.HiveContext@2e5affb3)
        - field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: hiveCtx, type: class org.apache.spark.sql.hive.HiveContext)
        - object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@1496a4ca)
        - field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
        - object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@1cd7de2)
        - field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
        - object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@59108fbf)
        - field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
        - object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@57ae2f11)
        - field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
        - object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC@482741af)
        - field (class: $iwC$$iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
        - object (class $iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC@54f10c5c)
        - field (class: $iwC$$iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC$$iwC)
        - object (class $iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC@14eb6c42)
        - field (class: $iwC$$iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC$$iwC)
        - object (class $iwC$$iwC$$iwC, $iwC$$iwC$$iwC@5be38ef2)
        - field (class: $iwC$$iwC, name: $iw, type: class $iwC$$iwC$$iwC)
        - object (class $iwC$$iwC, $iwC$$iwC@2c895dff)
        - field (class: $iwC, name: $iw, type: class $iwC$$iwC)
        - object (class $iwC, $iwC@412077c9)
        - field (class: $line35.$read, name: $iw, type: class $iwC)
        - object (class $line35.$read, $line35.$read@7ca9f111)
        - field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $VAL136, type: class $line35.$read)
        - object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@3bdb5380)
        - field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, name: $outer, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
        - object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC, $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC@7ee8882a)
        - field (class: $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1, name: $outer, type: class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC)
        - object (class $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1, <function1>)
        at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
        at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
        at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
        at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
        ... 57 more


scala> 

The process basically pulls three tables from Hive into sql.DataFrames. Then I add a new column named "label" to each of them and union the three tables together.

After that, since I want to do standard scaling before running a model, I add a new column named "features" with a VectorAssembler to combine all the columns except label, and that works. In this way I can build LabeledPoints for modeling later.

So I build an RDD of LabeledPoint named data_rdd and fit a StandardScaler on its feature vectors to get a scaler. However, when I try to transform the features with this scaler, it throws SparkException: Task not serializable.

I don't know the reason, so I hope someone can help me. Thanks!
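From the serialization stack it looks like the closure passed to map is dragging in the whole spark-shell line object, which still holds hiveCtx (and the non-serializable org.apache.spark.sql.CarbonEnv registered in it), rather than the scaler itself being unserializable. Below is a minimal sketch of a workaround I am considering: let the model scale the whole RDD with its RDD-level transform, so that none of my own closures reference scaler or anything else defined in the shell. I have not verified this on my cluster; that the RDD-level transform overload behaves this way is my assumption from the MLlib docs.

    // Sketch only: keep my own closures free of shell-defined objects.
    // `_.features` / `_.label` reference nothing defined in the REPL line,
    // and the scaling map is built inside StandardScalerModel itself.
    val featureRdd  = data_rdd.map(_.features)           // RDD[Vector]
    val scaledRdd   = scaler.transform(featureRdd)       // RDD[Vector], scaled
    // Both RDDs are derived from the same parent, so they zip element by element.
    val data_scaled = data_rdd.map(_.label).zip(scaledRdd)  // RDD[(Double, Vector)]

If this still hits the same exception, I guess the shell's import chain is being captured somewhere else.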

By the way, here is some other information:

  1. One row of my data looks like this:

    res3:Array [org.apache.spark.sql.Row] = Array([57.23282136599527,113.75,6.720267099897455,2.0439267018617353,12,107.46722210569848,71.99753613533004,49.37444375543352,141.0,2.0])

  2. Except for the last element, which is the label, all the other elements are features of the dataset, and I combine them into the new column named "features" with the VectorAssembler, as shown above.

  3. The Spark version is 1.5.1, so it's quite old :( (see the sketch after this list)
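Since 1.5.x already ships the DataFrame-based feature transformers, another thing I am wondering about is skipping the RDD closure entirely and scaling right after the VectorAssembler, inside the ML pipeline. A rough sketch, assuming org.apache.spark.ml.feature.StandardScaler with setWithMean/setWithStd works the same way in 1.5.1 as in the current docs:

    // Sketch only: scale on the DataFrame side, then build the LabeledPoint RDD
    // from the already-scaled column, so no shell-defined object ends up in a task.
    import org.apache.spark.ml.feature.StandardScaler
    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.mllib.regression.LabeledPoint

    val dfScaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithMean(true)
      .setWithStd(true)

    val scalerModel = dfScaler.fit(output)        // `output` is the assembled DataFrame above
    val scaledDf    = scalerModel.transform(output)

    val data_scaled = scaledDf.rdd.map { row =>
      LabeledPoint(row.getAs[Double]("label"), row.getAs[Vector]("scaledFeatures"))
    }

I have not tried this yet either, so I am not sure whether it avoids the same closure problem.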

0 Answers:

No answers yet.