Error: org.apache.spark.SparkException: Task not serializable

Posted: 2020-06-14 04:59:58

Tags: scala apache-spark machine-learning apache-spark-mllib

I am new to Scala and Spark and am trying to build a machine learning model. When I fit the model on the training data with val model = pipeline.fit(trainingData), a "Task not serializable" error is thrown. Please help me resolve this issue. The full code is below.

package com.skillassure.spark

import org.apache.spark._
import org.apache.spark.sql._
import org.apache.log4j._
import org.apache.spark.sql.functions._
import org.apache.spark.ml._
import org.apache.spark.ml.feature.Bucketizer
import org.apache.spark.ml.feature.{RegexTokenizer, IDF}
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.evaluation._
import org.apache.spark.ml.tuning._
import scala.reflect.api.materializeTypeTag

object ReviewAnalysis extends java.io.Serializable {


def main(args: Array[String]): Unit =  {

    // Set the log level to only print errors
    Logger.getLogger("org").setLevel(Level.ERROR)
    val spark = SparkSession.builder
                  .appName("SparkSessionExample")
                  .master("local[*]")
                  .getOrCreate();



    val file = "src/Resource/review-sample.json";

    val df0  = spark.read.format("json")
     .option("inferSchema", "true")
     .load(file);
    // Note: Column + Column performs numeric addition, not string concatenation,
    // and yields null for string columns; use concat() to join the two text fields.
    val df = df0.withColumn("reviewTS", concat(df0.col("summary"), lit(" "), df0.col("reviewText")))
                 .drop("helpful")
                 .drop("reviewerID")
                 .drop("reviewerName")
                 .drop("reviewTime");

    //df.printSchema;

    //df.show(5);
    df.describe("overall").show;

    val df1 = df.filter("overall != 3");
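    // bucketize the star rating into a binary label:
    // overall < 4.0 -> 0.0, overall >= 4.0 -> 1.0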
    val bucketizer = new Bucketizer()
                          .setInputCol("overall")
                          .setOutputCol("label")
                          .setSplits(Array(Double.NegativeInfinity, 4.0,
                           Double.PositiveInfinity))
    val df2= bucketizer.transform(df1);
    df2.show(5);
    //val df3 = df2.groupBy("overall","label").count.show;
    //df2.groupBy("label").count.show(5);

    val dff = df2.selectExpr("asin", "cast(overall as string) overall","reviewText","summary","unixReviewTime","cast(reviewTS as string) reviewTS","label");
    dff.show(10);
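    // stratified sample: keep ~10% of label 1.0 rows and all label 0.0 rows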
    val fractions = Map(1.0 -> .1, 0.0 -> 1.0);
    val df3 = dff.stat.sampleBy("label", fractions, 36L);
    df3.show(3);
    //df2.stat.sampleBy("label", fractions, 36L);
    //df3.groupBy("label").count().show;
    df3.printSchema();
    val splitSeed = 5043
    val Array(trainingData, testData) = dff.randomSplit(Array(0.8, 0.2), splitSeed)

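    // tokenize the combined review text, splitting on whitespace and basic punctuation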
    val tokenizer = new RegexTokenizer()
                        .setInputCol("reviewTS")
                        .setOutputCol("reviewTokensUf")
                        .setPattern("\\s+|[,.()\"]");
    val remover = new StopWordsRemover()
                        .setStopWords(StopWordsRemover
                        .loadDefaultStopWords("english"))
                        .setInputCol("reviewTokensUf")
                        .setOutputCol("reviewTokens");


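    // term-frequency vectors over the filtered tokens (vocabulary capped at 200,000)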
    val cv = new CountVectorizer()
                        .setInputCol("reviewTokens")
                        .setOutputCol("cv")
                        .setVocabSize(200000);

    // re-weight the raw term counts into TF-IDF features
    val idf = new IDF()
                .setInputCol("cv")
                .setOutputCol("features");


    // create the Logistic Regression estimator;
    // regularization parameters help avoid overfitting
    val lr = new LogisticRegression()
                    .setMaxIter(100)
                    .setRegParam(0.02)
                    .setElasticNetParam(0.3);


    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, cv, idf, lr));

    val model = pipeline.fit(trainingData)



}

}

The full error output:

    20/06/14 06:08:51 ERROR Instrumentation: org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:396)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:386)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:159)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2379)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndex$1(RDD.scala:886)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
    at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:885)
    at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:720)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:173)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:211)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:208)
    at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:169)
    at org.apache.spark.sql.execution.DeserializeToObjectExec.doExecute(objects.scala:96)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:173)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:211)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:208)
    at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:169)
    at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:110)
    at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:109)
    at org.apache.spark.sql.Dataset.rdd$lzycompute(Dataset.scala:3073)
    at org.apache.spark.sql.Dataset.rdd(Dataset.scala:3071)
    at org.apache.spark.ml.feature.CountVectorizer.fit(CountVectorizer.scala:191)
    at org.apache.spark.ml.feature.CountVectorizer.fit(CountVectorizer.scala:149)
    at org.apache.spark.ml.Pipeline.$anonfun$fit$5(Pipeline.scala:155)
    at org.apache.spark.ml.MLEvents.withFitEvent(events.scala:132)
    at org.apache.spark.ml.MLEvents.withFitEvent$(events.scala:125)
    at org.apache.spark.ml.util.Instrumentation.withFitEvent(Instrumentation.scala:42)
    at org.apache.spark.ml.Pipeline.$anonfun$fit$4(Pipeline.scala:155)
    at scala.collection.Iterator.foreach(Iterator.scala:929)
    at scala.collection.Iterator.foreach$(Iterator.scala:929)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1417)
    at scala.collection.IterableViewLike$Transformed.foreach(IterableViewLike.scala:44)
    at scala.collection.IterableViewLike$Transformed.foreach$(IterableViewLike.scala:44)
    at scala.collection.SeqViewLike$AbstractTransformed.foreach(SeqViewLike.scala:37)
    at org.apache.spark.ml.Pipeline.$anonfun$fit$2(Pipeline.scala:151)
    at org.apache.spark.ml.MLEvents.withFitEvent(events.scala:132)
    at org.apache.spark.ml.MLEvents.withFitEvent$(events.scala:125)
    at org.apache.spark.ml.util.Instrumentation.withFitEvent(Instrumentation.scala:42)
    at org.apache.spark.ml.Pipeline.$anonfun$fit$1(Pipeline.scala:137)
    at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
    at scala.util.Try$.apply(Try.scala:209)
    at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
    at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:137)
    at com.skillassure.spark.ReviewAnalysis$.main(ReviewAnalysis.scala:111)
    at com.skillassure.spark.ReviewAnalysis.main(ReviewAnalysis.scala)
Caused by: java.io.NotSerializableException: scala.runtime.LazyRef
Serialization stack:
    - object not serializable (class: scala.runtime.LazyRef, value: LazyRef thunk)
    - element of array (index: 2)
    - array (class [Ljava.lang.Object;, size 3)
    - field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
    - object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class org.apache.spark.sql.catalyst.expressions.ScalaUDF, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic org/apache/spark/sql/catalyst/expressions/ScalaUDF.$anonfun$f$2:(Lscala/Function1;Lorg/apache/spark/sql/catalyst/expressions/Expression;Lscala/runtime/LazyRef;Lorg/apache/spark/sql/catalyst/InternalRow;)Ljava/lang/Object;, instantiatedMethodType=(Lorg/apache/spark/sql/catalyst/InternalRow;)Ljava/lang/Object;, numCaptured=3])
    - writeReplace data (class: java.lang.invoke.SerializedLambda)
    - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF$$Lambda$2506/239902985, org.apache.spark.sql.catalyst.expressions.ScalaUDF$$Lambda$2506/239902985@3f64d943)
    - field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF, name: f, type: interface scala.Function1)
    - object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF, bucketizer_0(knownnotnull(overall#9)))
    - field (class: org.apache.spark.sql.catalyst.expressions.If, name: falseValue, type: class org.apache.spark.sql.catalyst.expressions.Expression)
    - object (class org.apache.spark.sql.catalyst.expressions.If, if (isnull(overall#9)) null else bucketizer_0(knownnotnull(overall#9)))
    - field (class: org.apache.spark.sql.catalyst.expressions.Alias, name: child, type: class org.apache.spark.sql.catalyst.expressions.Expression)
    - object (class org.apache.spark.sql.catalyst.expressions.Alias, if (isnull(overall#9)) null else bucketizer_0(knownnotnull(overall#9)) AS label#133)
    - element of array (index: 5)
    - array (class [Ljava.lang.Object;, size 6)
    - field (class: scala.collection.mutable.ArrayBuffer, name: array, type: class [Ljava.lang.Object;)
    - object (class scala.collection.mutable.ArrayBuffer, ArrayBuffer(asin#7, cast(overall#9 as string) AS overall#187, reviewText#10, summary#14, unixReviewTime#15L, if (isnull(overall#9)) null else bucketizer_0(knownnotnull(overall#9)) AS label#133))
    - field (class: org.apache.spark.sql.execution.ProjectExec, name: projectList, type: interface scala.collection.Seq)
    - object (class org.apache.spark.sql.execution.ProjectExec, Project [asin#7, cast(overall#9 as string) AS overall#187, reviewText#10, summary#14, unixReviewTime#15L, if (isnull(overall#9)) null else bucketizer_0(knownnotnull(overall#9)) AS label#133]
+- Filter (isnotnull(overall#9) AND NOT (overall#9 = 3.0))
   +- BatchScan[asin#7, overall#9, reviewText#10, summary#14, unixReviewTime#15L] JsonScan Location: InMemoryFileIndex[file:/C:/SparkScala/SparkLearning/src/Resource/review-sample.json], ReadSchema: struct<asin:string,overall:double,reviewText:string,summary:string,unixReviewTime:bigint>
)
    - field (class: org.apache.spark.sql.execution.SortExec, name: child, type: class org.apache.spark.sql.execution.SparkPlan)
    - object (class org.apache.spark.sql.execution.SortExec, Sort [asin#7 ASC NULLS FIRST, overall#187 ASC NULLS FIRST, reviewText#10 ASC NULLS FIRST, summary#14 ASC NULLS FIRST, unixReviewTime#15L ASC NULLS FIRST, label#133 ASC NULLS FIRST], false, 0
+- Project [asin#7, cast(overall#9 as string) AS overall#187, reviewText#10, summary#14, unixReviewTime#15L, if (isnull(overall#9)) null else bucketizer_0(knownnotnull(overall#9)) AS label#133]
   +- Filter (isnotnull(overall#9) AND NOT (overall#9 = 3.0))
      +- BatchScan[asin#7, overall#9, reviewText#10, summary#14, unixReviewTime#15L] JsonScan Location: InMemoryFileIndex[file:/C:/SparkScala/SparkLearning/src/Resource/review-sample.json], ReadSchema: struct<asin:string,overall:double,reviewText:string,summary:string,unixReviewTime:bigint>
)
    - element of array (index: 0)
    - array (class [Ljava.lang.Object;, size 15)
    - element of array (index: 1)
    - array (class [Ljava.lang.Object;, size 3)
    - field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
    - object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class org.apache.spark.sql.execution.WholeStageCodegenExec, functionalInterfaceMethod=scala/Function2.apply:(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic org/apache/spark/sql/execution/WholeStageCodegenExec.$anonfun$doExecute$4$adapted:(Lorg/apache/spark/sql/catalyst/expressions/codegen/CodeAndComment;[Ljava/lang/Object;Lorg/apache/spark/sql/execution/metric/SQLMetric;Ljava/lang/Object;Lscala/collection/Iterator;)Lscala/collection/Iterator;, instantiatedMethodType=(Ljava/lang/Object;Lscala/collection/Iterator;)Lscala/collection/Iterator;, numCaptured=3])
    - writeReplace data (class: java.lang.invoke.SerializedLambda)
    - object (class org.apache.spark.sql.execution.WholeStageCodegenExec$$Lambda$2181/2096690266, org.apache.spark.sql.execution.WholeStageCodegenExec$$Lambda$2181/2096690266@79d14037)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:41)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:393)
    ... 48 more

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
    ... (stack trace and serialization stack identical to the ERROR Instrumentation output above)

1 Answer:

Answer 0 (score: 0):

The failing code is not in the ReviewAnalysis object shown in the question. Try commenting out or removing the code in the following class in your project, then compile again -

com.spark.programming.foodreview$.main(foodreview.scala:154)
at com.spark.programming.foodreview.main(foodreview.scala)
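
As an additional isolation step, you could fit the same pipeline on a tiny in-memory DataFrame with no other project code involved. Below is a minimal sketch (the sample rows are invented for illustration; spark and pipeline refer to the values defined in the question):

    import spark.implicits._

    // Hypothetical isolation test: if this fit succeeds, the pipeline definition
    // itself serializes fine and the failure comes from elsewhere in the project.
    val tiny = Seq(
      ("great product works well", 1.0),
      ("terrible broke after a day", 0.0),
      ("does exactly what it should", 1.0),
      ("would not buy again", 0.0)
    ).toDF("reviewTS", "label")

    val probeModel = pipeline.fit(tiny)
    println("pipeline fit succeeded on in-memory data")

If even this minimal fit fails with the same NotSerializableException: scala.runtime.LazyRef, one common cause is a mismatch between the project's Scala version and the Scala version the Spark dependency was built against, so the build configuration is also worth checking.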