"main" org.apache.spark.SparkException: Task not serializable

Date: 2017-07-07 02:33:52

Tags: java mongodb apache-spark serialization

I am trying to run the following simple Spark code:

package com.bdg.try.graph_api;
import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.config.WriteConfig;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import static java.util.Arrays.asList;
@SpringBootApplication
public class SparkStreamingApplication {

    private final Logger logger = LoggerFactory.getLogger(SparkStreamingApplication.class);

    public static void main(String[] args) throws Exception {
        SpringApplication.run(SparkStreamingApplication.class, args);
        SparkStreamingApplication main = new SparkStreamingApplication();
        main.run();
    }

    private void run() {
        SparkSession spark = SparkSession
                .builder()
                .master("spark://192.168.xx.xx:7077")
                .config("spark.mongodb.input.uri", "mongodb://192.168.xx.xx:27017/database.test_spark")
                .config("spark.database.output.uri", "mongodb://192.168.xx.xx:27017/database.test_spark")
                .config("spark.driver.allowMultipleContexts", "true")
                .appName("SparkTest")
                .getOrCreate();

        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        // Create a custom WriteConfig
        Map<String, String> writeOverrides = new HashMap<String, String>();
        writeOverrides.put("collection", "spark");
        writeOverrides.put("writeConcern.w", "majority");
        WriteConfig writeConfig = WriteConfig.create(jsc).withOptions(writeOverrides);

        // Create a RDD of 10 documents
        JavaRDD<Document> sparkDocuments = jsc.parallelize(asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
                .map(new Function<Integer, Document>() {
                    public Document call(final Integer i) throws Exception {
                        return Document.parse("{spark: " + i + "}");
                    }
                });

        System.out.println("collection : " + writeOverrides);
        MongoSpark.save(sparkDocuments, writeConfig);
        spark.stop();
    }
}

Here is the exception that gets logged:

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:370)
at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:369)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.map(RDD.scala:369)
at org.apache.spark.api.java.JavaRDDLike$class.map(JavaRDDLike.scala:93)
at org.apache.spark.api.java.AbstractJavaRDDLike.map(JavaRDDLike.scala:45)
at com.bdg.ebdesk.graph_api.GraphApiApplication.run(GraphApiApplication.java:60)
at com.bdg.ebdesk.graph_api.GraphApiApplication.main(GraphApiApplication.java:37)
Caused by: java.io.NotSerializableException: com.bdg.try.graph_api.GraphApiApplication
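The last line is the key: the anonymous Function is a non-static inner class, so it implicitly captures the enclosing application instance (named GraphApiApplication in the trace, SparkStreamingApplication in the code above), and that instance is what Spark fails to serialize, not jsc itself. A minimal, Spark-free sketch of this capture, assuming Java 8 as shown in the logs (CaptureDemo is an illustrative name, not part of the original code):

import java.lang.reflect.Field;

public class CaptureDemo {

    // Stand-in for the (non-serializable) Spring Boot application class.
    Runnable makeAnonymous() {
        // On Java 8, an anonymous inner class created in an instance method
        // always keeps an implicit reference to the enclosing instance
        // (a synthetic "this$0" field), even when it never uses it.
        return new Runnable() {
            @Override
            public void run() { }
        };
    }

    public static void main(String[] args) {
        Runnable r = new CaptureDemo().makeAnonymous();
        for (Field f : r.getClass().getDeclaredFields()) {
            // Prints something like: this$0 : CaptureDemo
            System.out.println(f.getName() + " : " + f.getType().getSimpleName());
        }
    }
}

When Spark cleans and serializes the map closure, that hidden reference drags the whole enclosing object into the task payload.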

[UPDATE] Next, I added implements java.io.Serializable to the class, but then I get the error log below:

2017-07-07 09:27:19.238 ERROR 4369 --- [ffle-server-3-1] o.a.s.n.server.TransportRequestHandler   : Error while invoking RpcHandler#receive() on RPC id 7048300146537589013
java.lang.ClassNotFoundException: org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages$RetrieveSparkProps$
at java.net.URLClassLoader.findClass(URLClassLoader.java:381) ~[na:1.8.0_131]
at java.lang.ClassLoader.loadClass(ClassLoader.java:424) ~[na:1.8.0_131]
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335) ~[na:1.8.0_131]
at java.lang.ClassLoader.loadClass(ClassLoader.java:357) ~[na:1.8.0_131]
at java.lang.Class.forName0(Native Method) ~[na:1.8.0_131]
at java.lang.Class.forName(Class.java:348) ~[na:1.8.0_131]
at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67) ~[spark-core_2.11-2.1.0.jar:2.1.0]
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1826) ~[na:1.8.0_131]
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1713) ~[na:1.8.0_131]
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2000) ~[na:1.8.0_131]
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535) ~[na:1.8.0_131]
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245) ~[na:1.8.0_131]
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169) ~[na:1.8.0_131]
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027) ~[na:1.8.0_131]
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535) ~[na:1.8.0_131]
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:422) ~[na:1.8.0_131]

Here, 'jsc' is the JavaSparkContext object I am using. As far as I know, JavaSparkContext is not serializable and should not be used inside any function that will be sent to the Spark workers.

What I cannot understand now is: how does an instance of JavaSparkContext end up being sent to the workers? What should I change in my code to avoid this?
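For reference, one common way to keep the non-serializable enclosing class out of the task closure is to move the mapping function into a static nested class (or a Java 8 lambda that only touches its parameter), so that nothing but the function object itself is serialized. A minimal sketch, assuming a local master and the same MongoDB Java driver on the classpath; the names ClosureSketch and ToDocument are illustrative only:

import static java.util.Arrays.asList;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;

public class ClosureSketch {

    // A static nested class has no hidden reference to an enclosing instance,
    // and Spark's Function interface already extends java.io.Serializable.
    static class ToDocument implements Function<Integer, Document> {
        @Override
        public Document call(final Integer i) {
            return Document.parse("{spark: " + i + "}");
        }
    }

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")          // assumption: local master just for the sketch
                .appName("ClosureSketch")
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

        JavaRDD<Document> docs = jsc
                .parallelize(asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
                .map(new ToDocument());

        // A lambda that only uses its parameter avoids the capture as well:
        // jsc.parallelize(asList(1, 2, 3)).map(i -> Document.parse("{spark: " + i + "}"));

        System.out.println(docs.count());
        spark.stop();
    }
}

With the function defined this way, only the small ToDocument object is shipped with the map task; the SparkSession, JavaSparkContext, and the rest of the driver-side state never need to leave the driver.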

0 Answers:

There are no answers yet.