"主" org.apache.spark.SparkException:任务不可序列化

Date: 2017-07-05 08:00:13

Tags: java multithreading mongodb apache-spark serialization

I am trying to run the following simple Spark code:

public static void main(final String[] args) throws Exception {

    ClassLoader.getSystemClassLoader().getClass().getName();
    Thread.currentThread().getContextClassLoader().getClass().getName();
    GraphApiApplication.class.getClassLoader();

    /*if (args.length < 1) {
        System.err.println("try again");
        System.exit(1);
    }*/

    GraphApiApplication main = new GraphApiApplication();
    main.run();
}

private void run(){
    SparkSession spark = SparkSession
            .builder()
            .master("spark://192.168.xx.xx:7077")
            .config("spark.mongodb.input.uri", "mongodb://192.168.xx.xx:27017/database.test_spark")
            .config("spark.database.output.uri", "mongodb://192.168.xx.xx:27017/database.test_spark")
            .config("spark.driver.allowMultipleContexts", "true")
            .appName("SparkTest")
            .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    // Create a custom WriteConfig
    Map<String, String> writeOverrides = new HashMap<String, String>();
    writeOverrides.put("collection", "spark");
    writeOverrides.put("writeConcern.w", "majority");
    WriteConfig writeConfig = WriteConfig.create(jsc).withOptions(writeOverrides);

    // Create a RDD of 10 documents
    JavaRDD<Document> sparkDocuments = jsc.parallelize(asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
        .map(new Function<Integer, Document>() {
            public Document call(final Integer i) throws Exception {
                return Document.parse("{spark: " + i + "}");
            }
        });

    System.out.println("collection : "+writeOverrides);
    MongoSpark.save(sparkDocuments, writeConfig);
    spark.stop();
}

The exception is logged here:

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
    at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:370)
    at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:369)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.RDD.map(RDD.scala:369)
    at org.apache.spark.api.java.JavaRDDLike$class.map(JavaRDDLike.scala:93)
    at org.apache.spark.api.java.AbstractJavaRDDLike.map(JavaRDDLike.scala:45)
    at com.bdg.ebdesk.graph_api.GraphApiApplication.run(GraphApiApplication.java:60)
    at com.bdg.ebdesk.graph_api.GraphApiApplication.main(GraphApiApplication.java:37)
Caused by: java.io.NotSerializableException: com.bdg.ebdesk.graph_api.GraphApiApplication
Serialization stack:
    - object not serializable (class: com.bdg.ebdesk.graph_api.GraphApiApplication, value: com.bdg.ebdesk.graph_api.GraphApiApplication@7e0aadd0)
    - field (class: com.bdg.ebdesk.graph_api.GraphApiApplication$1, name: this$0, type: class com.bdg.ebdesk.graph_api.GraphApiApplication)
    - object (class com.bdg.ebdesk.graph_api.GraphApiApplication$1, com.bdg.ebdesk.graph_api.GraphApiApplication$1@62d363ab)
    - field (class: org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1, name: fun$1, type: interface org.apache.spark.api.java.function.Function)
    - object (class org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1, <function1>)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
    ... 13 more

Here 'jsc' is the JavaSparkContext object I am using. As far as I know, JavaSparkContext is not a Serializable object and should not be used in any function that will be sent to Spark workers.

Now, what I cannot understand is: how is an instance of JavaSparkContext being sent to the workers? What should I change in my code to avoid this?

1 Answer:

Answer 0 (score: 1)

When you write an anonymous inner class, a named (non-static) inner class, or a lambda that refers to the enclosing instance, Java keeps a reference to the outer class inside the inner class.

So even if the inner class itself is serializable, the exception can still occur, because the outer class must also be serializable.
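
To make that concrete, here is a small self-contained sketch (the class name Outer is hypothetical and not from the question) that uses reflection to show the synthetic field the compiler adds to an anonymous inner class:

import java.lang.reflect.Field;
import java.util.concurrent.Callable;

public class Outer {

    // Compiled as Outer$1; the compiler adds a synthetic field this$0
    // that points back at the enclosing Outer instance.
    public Callable<String> makeTask() {
        return new Callable<String>() {
            public String call() {
                return "hello";
            }
        };
    }

    public static void main(String[] args) {
        Callable<String> task = new Outer().makeTask();
        for (Field f : task.getClass().getDeclaredFields()) {
            // Prints roughly: this$0 -> class Outer
            System.out.println(f.getName() + " -> " + f.getType());
        }
    }
}

When Spark serializes the closure, it follows that this$0 field and tries to serialize the enclosing object as well, which is exactly what the serialization stack above shows for GraphApiApplication$1.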

Add implements Serializable to your GraphApiApplication class, because it is the outer class of the anonymous inner class that is written as an argument to the map function.
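
A minimal sketch of both options follows (the class and method names mirror the question but are only illustrative, and the Spark/Mongo setup is omitted): either make the enclosing class serializable, as suggested above, or use a function that captures no reference to the enclosing instance at all, such as a static nested class or a lambda.

import static java.util.Arrays.asList;

import java.io.Serializable;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.bson.Document;

// Option 1 (the fix from the answer): the enclosing class implements
// Serializable, so the captured this$0 reference can be serialized.
public class GraphApiApplication implements Serializable {

    // Option 2 (often cleaner): a static nested class has no hidden
    // reference to the enclosing instance, so only the function object
    // itself is serialized.
    private static final class ToDocument implements Function<Integer, Document> {
        public Document call(final Integer i) {
            return Document.parse("{spark: " + i + "}");
        }
    }

    private void run(JavaSparkContext jsc) {
        JavaRDD<Document> sparkDocuments =
                jsc.parallelize(asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
                   .map(new ToDocument());

        // A lambda that only uses its parameter behaves the same way:
        // jsc.parallelize(asList(1, 2, 3)).map(i -> Document.parse("{spark: " + i + "}"));

        System.out.println("count: " + sparkDocuments.count());
    }
}

The second option is usually preferable, because it avoids dragging the whole application object (and any non-serializable fields it may hold) into the closure that Spark ships to the workers.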