Apache Spark: object not serializable exception when returning a List iterator

Asked: 2017-01-29 23:24:28

Tags: java apache-spark

So I have the following code:

    JavaRDD<Edge> distEdges = sc.parallelize(edges);

    //c-c
    JavaPairRDD<Integer, Tuple2<Integer, Double>> ctoC = distEdges.filter((e) -> {
        return bgraph.getValue().isC(e.getU()) && bgraph.getValue().isC(e.getV());
    }).mapToPair((e) -> {
        return new Tuple2<>(e.getU(), new Tuple2<>(e.getV(), e.getWeight()));
    });

    //f-c
    JavaPairRDD<Integer, Tuple2<Integer, Double>> ftoC = distEdges.filter((e) -> {
        return !bgraph.getValue().isC(e.getU()) && bgraph.getValue().isC(e.getV());
    }).flatMapToPair((Edge e) -> {
        int u = e.getU(); //fine
        int v = e.getV(); //coarse
        List<Tuple2<Integer, Tuple2<Integer, Double>>> list = new ArrayList<>();
        for (Edge cEdge : g.cAdj(u)) { //get coarse neighbors of the fine endpoint
            int nb = cEdge.getEndpoint(u);
            if (nb != v) {
                double w = cEdge.getPij() * e.getWeight();
                if (w != 0) {
                    //addToSyncTable(cEdges, v, nb, w);
                    list.add(new Tuple2<>(v, new Tuple2<>(nb, w)));
                }
            }
        }
        return list.iterator();
    });

    //c-f
    JavaPairRDD<Integer, Tuple2<Integer, Double>> ctoF = distEdges.filter((e) -> {
        return bgraph.getValue().isC(e.getU()) && !bgraph.getValue().isC(e.getV());
    }).flatMapToPair((Edge e) -> {
        int u = e.getU(); //coarse
        int v = e.getV(); //fine
        List<Tuple2<Integer, Tuple2<Integer, Double>>> list = new ArrayList<>();
        for (Edge cEdge : g.cAdj(v)) { //get coarse neighbors of the fine endpoint
            int nb = cEdge.getEndpoint(v);
            if (nb != u) {
                double w = cEdge.getPij() * e.getWeight();
                if (w != 0) {
                    //addToSyncTable(cEdges, v, nb, w);
                    list.add(new Tuple2<>(u, new Tuple2<>(nb, w)));
                }
            }
        }
        return list.iterator();
    });

    //f-f
    JavaPairRDD<Integer, Tuple2<Integer, Double>> ftoF = distEdges.filter((e) -> {
        return !bgraph.getValue().isC(e.getU()) && !bgraph.getValue().isC(e.getV());
    }).flatMapToPair((Edge e) -> {
        int u = e.getU();
        int v = e.getV();
        List<Tuple2<Integer, Tuple2<Integer, Double>>> list = new ArrayList<>();
        for (Edge cEdgeU : g.cAdj(u)) { //get coarse neighbors of both fine endpoints
            int uNb = cEdgeU.getEndpoint(u);
            for (Edge cEdgeV : g.cAdj(v)) {
                int vNb = cEdgeV.getEndpoint(v);
                if (uNb != vNb) {
                    double w = cEdgeU.getPij() * e.getWeight() * cEdgeV.getPij();
                    if (w != 0) {
                        //addToTable(coarseEdges, idMap.get(uNb), idMap.get(vNb), w);
                        list.add(new Tuple2<>(uNb, new Tuple2<>(vNb, w)));
                    }
                }
            }
        }
        return list.iterator();
    });

    List<Tuple2<Integer, Iterator<Tuple2<Integer, Double>>>> cEdges =
            ctoC.union(ftoF).union(ftoC).union(ctoF).groupByKey().mapValues((a) -> {
        HashMap<Integer, Double> curEdges = new HashMap<>();
        for (Tuple2<Integer, Double> t : a) {
            curEdges.merge(t._1(), t._2(), (weight, newWeight) -> weight + newWeight);
        }

        ArrayList<Tuple2<Integer, Double>> edgesT = new ArrayList<>(curEdges.size());

        curEdges.entrySet().forEach((e) -> {
            edgesT.add(new Tuple2<>(e.getKey(), e.getValue()));
        });

        return edgesT.iterator();
    }).collect();

    Graph cg = new Graph(g, cEdges, seeds, sc);
    sc.stop();

I get the following exception when I try to run this:

Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 1.0 (TID 4) had a not serializable result: java.util.ArrayList$Itr
Serialization stack:
- object not serializable (class: java.util.ArrayList$Itr, value: java.util.ArrayList$Itr@e6275ea)
- field (class: scala.Tuple2, name: _2, type: class java.lang.Object)
- object (class scala.Tuple2, (7532,java.util.ArrayList$Itr@e6275ea))
- element of array (index: 0)
- array (class [Lscala.Tuple2;, size 3068)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:935)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:934)
at org.apache.spark.api.java.JavaRDDLike$class.collect(JavaRDDLike.scala:361)
at org.apache.spark.api.java.AbstractJavaRDDLike.collect(JavaRDDLike.scala:45)
at com.acs.clemson.ordering.graph.GraphBuilder.buildByTriples(GraphBuilder.java:122)
at com.acs.clemson.ordering.algo.AmgCoarsener.coarsen(AmgCoarsener.java:51)
at Manager.ML(Manager.java:73)
at Main.main(Main.java:16)

When I looked into this, it turns out that ArrayList.iterator() returns an Itr object whose class does not implement Serializable.
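
This is easy to verify outside of Spark with a minimal standalone check (the class name ItrCheck is just for illustration):

    import java.io.Serializable;
    import java.util.ArrayList;

    public class ItrCheck {
        public static void main(String[] args) {
            // The list itself implements Serializable...
            System.out.println(new ArrayList<Integer>() instanceof Serializable); // true
            // ...but the ArrayList$Itr returned by iterator() does not
            System.out.println(new ArrayList<Integer>().iterator() instanceof Serializable); // false
        }
    }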

Is there another way to return an iterable in Spark?

UPDATE: Downgrading to Spark version 1.6.2 and returning the list (an Iterable) instead of an iterator fixed the problem. As of version 2.1.0 you can only return an iterator from flatMapToPair, and I am still not sure why the code breaks on 2.1.0.
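
One plausible explanation: the iterator returned from flatMapToPair is consumed on the executor while the partition is being computed and never gets serialized, whereas the value returned from mapValues becomes the RDD element itself, and collect() must serialize every element to ship it to the driver; java.util.ArrayList$Itr is not Serializable, which is exactly what the serialization stack above shows. If that is right, returning the list itself from mapValues (ArrayList is Serializable) should also work on 2.1.0 without downgrading. A minimal sketch of that variant, reusing the pair RDDs from the code above:

    List<Tuple2<Integer, List<Tuple2<Integer, Double>>>> cEdges =
            ctoC.union(ftoF).union(ftoC).union(ctoF).groupByKey().mapValues((a) -> {
        HashMap<Integer, Double> curEdges = new HashMap<>();
        for (Tuple2<Integer, Double> t : a) {
            curEdges.merge(t._1(), t._2(), Double::sum);
        }
        // Return the list itself rather than list.iterator():
        // ArrayList is Serializable, so collect() can ship it to the driver.
        List<Tuple2<Integer, Double>> edgesT = new ArrayList<>(curEdges.size());
        curEdges.forEach((k, v) -> edgesT.add(new Tuple2<>(k, v)));
        return edgesT;
    }).collect();

The collected values are then lists instead of iterators, so the Graph constructor would have to accept that type instead.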

0 Answers:

No answers yet.