Task not serializable error - Spark Java

Asked: 2017-03-30 18:04:52

Tags: java csv apache-spark serialization

I have the following code in my Spark application. It is supposed to filter genes out of a set of CSV files. I am loading the CSV files into a Spark RDD. When I run the jar with spark-submit, I get a Task not serializable exception.

public class AttributeSelector {

    public static final String path = System.getProperty("user.dir") + File.separator;
    public static Queue<Instances> result = new LinkedBlockingQueue<>();
    private static final Logger LOGGER = LoggerFactory.getLogger(AttributeSelector.class);

    int[] selectAttributes(Instances data) {

        int[] indexes = null;
        AttributeSelection filter = new AttributeSelection();
        CfsSubsetEval evaluator = new CfsSubsetEval();
        filter.setEvaluator(evaluator);
        BestFirst search = new BestFirst();
        filter.setSearch(search);
        try {
            filter.SelectAttributes(data);
            indexes = filter.selectedAttributes();
        } catch (Exception e) {
            System.out.println("Error when resampling input data with selected attributes!");
            e.printStackTrace();
        }
        return indexes;

    }

    public void selectData(Instances data, int[] indexes) {

        Instances newData = data;
        Remove remove = new Remove();
        remove.setAttributeIndicesArray(indexes);
        remove.setInvertSelection(true);

        try {
            remove.setInputFormat(data);
            newData = Filter.useFilter(data, remove);
            result.add(newData);
        } catch (Exception e) {
            e.printStackTrace();
        }

    }

    private Instances getInputInstance(File fileName) {
        CSVLoader loader = new CSVLoader();
        Instances instance = null;
        try {
            loader.setSource(fileName);
            instance = loader.getDataSet();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return instance;
    }

    private void writeMergedOutput() {

        LOGGER.info("Started merging results");
        Instances finalResult = result.poll();

        while (!result.isEmpty()) {
            finalResult = Instances.mergeInstances(finalResult, result.poll());
        }

        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(path + "Output" + ".arff"));
            writer.write(finalResult.toString());
            writer.flush();
            writer.close();

        } catch (Exception e) {
            e.printStackTrace();
        }
        LOGGER.info("Finished merging results");
    }

    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        try {
            LOGGER.info("Loading data");
            AttributeSelector attributeSelector = new AttributeSelector();
            attributeSelector.run(path + "Parts");

        } catch (Exception e) {
            e.printStackTrace();
        }
        long end = System.currentTimeMillis();
        LOGGER.info("Execution time: " + (end - start));
    }

    public void run(String sourceDir) {
        String master = "local[*]";

        SparkConf conf = new SparkConf()
                .setAppName(AttributeSelector.class.getName())
                .setMaster(master);

        JavaSparkContext context = new JavaSparkContext(conf);

        JavaFutureAction<Void> task = context.wholeTextFiles(sourceDir)
            .foreachAsync(new VoidFunction<Tuple2<String,String>>(){

                @Override
                public void call(Tuple2<String, String> fileInfo) throws Exception {
                    File file = new File(fileInfo._1);
                    Instances instance = getInputInstance(file);
                    instance.setClassIndex(instance.numAttributes() - 1);
                    int[] indices = selectAttributes(instance);
                    selectData(instance, indices);
                    LOGGER.info("Finished executing: " + fileInfo._1);
                }

        });

        while(!task.isDone()){

        }
        writeMergedOutput();

        context.close();
    }

}

What is causing this exception, and how can I fix it?

The exception I get is:

org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
    at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1.apply(AsyncRDDActions.scala:126)
    at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1.apply(AsyncRDDActions.scala:125)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.AsyncRDDActions.foreachAsync(AsyncRDDActions.scala:125)
    at org.apache.spark.api.java.JavaRDDLike$class.foreachAsync(JavaRDDLike.scala:732)
    at org.apache.spark.api.java.AbstractJavaRDDLike.foreachAsync(JavaRDDLike.scala:45)
    at geneselection.AttributeSelector.run(AttributeSelector.java:129)
    at geneselection.AttributeSelector.main(AttributeSelector.java:110)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:738)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:126)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.NotSerializableException: geneselection.AttributeSelector
Serialization stack:
    - object not serializable (class: geneselection.AttributeSelector, value: geneselection.AttributeSelector@5d43409a)
    - field (class: geneselection.AttributeSelector$1, name: this$0, type: class geneselection.AttributeSelector)
    - object (class geneselection.AttributeSelector$1, geneselection.AttributeSelector$1@210308d5)
    - field (class: org.apache.spark.api.java.JavaRDDLike$$anonfun$foreachAsync$1, name: f$15, type: interface org.apache.spark.api.java.function.VoidFunction)
    - object (class org.apache.spark.api.java.JavaRDDLike$$anonfun$foreachAsync$1, <function1>)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
    ... 22 more

1 Answer:

Answer 0 (score: 2):

According to the Java documentation for java.io.NotSerializableException:

    Thrown when an instance is required to have a Serializable interface.

So you need geneselection.AttributeSelector to implement java.io.Serializable (see the documentation).
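
The serialization stack in the question shows exactly why: the anonymous VoidFunction (geneselection.AttributeSelector$1) carries an implicit this$0 field pointing at the enclosing AttributeSelector instance, so Spark has to serialize the whole outer object when it ships the closure to executors. Adding implements java.io.Serializable to the class declaration is the one-line fix, and it is safe here because path, result and LOGGER are static fields and therefore not serialized with the instance. Below is a sketch of an alternative that avoids capturing the outer instance at all by moving the closure into a static nested class. ProcessFile is a hypothetical name, and the sketch assumes the posted helper methods (selectAttributes, selectData) are made static so no instance is needed:

import java.io.File;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

import weka.core.Instances;
import weka.core.converters.CSVLoader;

public class AttributeSelector {

    // A static nested class has no implicit this$0 field, so only this small
    // object itself is serialized with each task.
    private static class ProcessFile implements VoidFunction<Tuple2<String, String>> {
        @Override
        public void call(Tuple2<String, String> fileInfo) throws Exception {
            // Same loading logic as the posted getInputInstance(...)
            CSVLoader loader = new CSVLoader();
            loader.setSource(new File(fileInfo._1));
            Instances instance = loader.getDataSet();
            instance.setClassIndex(instance.numAttributes() - 1);
            // selectAttributes(instance) and selectData(...) would be called
            // here once they are made static.
        }
    }

    public void run(String sourceDir) throws Exception {
        SparkConf conf = new SparkConf()
                .setAppName(AttributeSelector.class.getName())
                .setMaster("local[*]");
        JavaSparkContext context = new JavaSparkContext(conf);

        // foreachAsync returns a Future; get() blocks until the job finishes,
        // replacing the busy-wait loop on task.isDone().
        context.wholeTextFiles(sourceDir)
               .foreachAsync(new ProcessFile())
               .get();

        context.close();
    }
}

One caveat applies either way: the static result queue is only shared because master is local[*], where driver and executors run in the same JVM. On a real cluster each executor has its own copy of the class, so results added there would never reach the driver's writeMergedOutput(); you would instead need to bring the filtered Instances back to the driver, for example with a map followed by collect.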