So I have the following code:
JavaRDD<Edge> distEdges = sc.parallelize(edges);
//c - c
JavaPairRDD<Integer, Tuple2<Integer, Double>> ctoC = distEdges.filter((e) -> {
    return bgraph.getValue().isC(e.getU()) && bgraph.getValue().isC(e.getV());
}).mapToPair((e) -> {
    return new Tuple2<>(e.getU(), new Tuple2<>(e.getV(), e.getWeight()));
});
//f - c
JavaPairRDD<Integer, Tuple2<Integer, Double>> ftoC = distEdges.filter((e) -> {
    return (!bgraph.getValue().isC(e.getU())) && bgraph.getValue().isC(e.getV());
}).flatMapToPair((Edge e) -> {
    int u = e.getU();//fine
    int v = e.getV();//coarse
    List<Tuple2<Integer, Tuple2<Integer, Double>>> list = new ArrayList<>();
    for (Edge cEdge : g.cAdj(u)) {//get coarse neighbors of the fine endpoint
        int nb = cEdge.getEndpoint(u);
        if (nb != v) {
            double w = cEdge.getPij() * e.getWeight();
            if (w != 0) {
                //addToSyncTable(cEdges, v, nb, w);
                list.add(new Tuple2<>(v, new Tuple2<>(nb, w)));
            }
        }
    }
    return list.iterator();
});
//c - f
JavaPairRDD<Integer, Tuple2<Integer, Double>> ctoF = distEdges.filter((e) -> {
    return bgraph.getValue().isC(e.getU()) && (!bgraph.getValue().isC(e.getV()));
}).flatMapToPair((Edge e) -> {
    int u = e.getU();//coarse
    int v = e.getV();//fine
    List<Tuple2<Integer, Tuple2<Integer, Double>>> list = new ArrayList<>();
    for (Edge cEdge : g.cAdj(v)) {//get coarse neighbors of the fine endpoint
        int nb = cEdge.getEndpoint(v);
        if (nb != u) {
            double w = cEdge.getPij() * e.getWeight();
            if (w != 0) {
                //addToSyncTable(cEdges, v, nb, w);
                list.add(new Tuple2<>(u, new Tuple2<>(nb, w)));
            }
        }
    }
    return list.iterator();
});
//f - f
JavaPairRDD<Integer, Tuple2<Integer, Double>> ftoF = distEdges.filter((e) -> {
    return (!bgraph.getValue().isC(e.getU())) && (!bgraph.getValue().isC(e.getV()));
}).flatMapToPair((Edge e) -> {
    int u = e.getU();
    int v = e.getV();
    List<Tuple2<Integer, Tuple2<Integer, Double>>> list = new ArrayList<>();
    for (Edge cEdgeU : g.cAdj(u)) {//get coarse neighbors of both fine endpoints
        int uNb = cEdgeU.getEndpoint(u);
        for (Edge cEdgeV : g.cAdj(v)) {
            int vNb = cEdgeV.getEndpoint(v);
            if (uNb != vNb) {
                double w = cEdgeU.getPij() * e.getWeight() * cEdgeV.getPij();
                if (w != 0) {
                    //addToTable(coarseEdges, idMap.get(uNb), idMap.get(vNb), w);
                    list.add(new Tuple2<>(uNb, new Tuple2<>(vNb, w)));
                }
            }
        }
    }
    return list.iterator();
});
List<Tuple2<Integer, Iterator<Tuple2<Integer, Double>>>> cEdges = ctoC.union(ftoF).union(ftoC).union(ctoF)
        .groupByKey()
        .mapValues((a) -> {
            //merge parallel edges by summing their weights
            HashMap<Integer, Double> curEdges = new HashMap<>();
            for (Tuple2<Integer, Double> t : a) {
                curEdges.merge(t._1(), t._2(), (weight, newWeight) -> weight + newWeight);
            }
            ArrayList<Tuple2<Integer, Double>> edgesT = new ArrayList<>(curEdges.size());
            curEdges.entrySet().forEach((e) -> {
                edgesT.add(new Tuple2<>(e.getKey(), e.getValue()));
            });
            return edgesT.iterator();
        }).collect();
Graph cg = new Graph(g, cEdges, seeds, sc);
sc.stop();
When I try to run it, I get the following exception:
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 1.0 (TID 4) had a not serializable result: java.util.ArrayList$Itr
Serialization stack:
- object not serializable (class: java.util.ArrayList$Itr, value: java.util.ArrayList$Itr@e6275ea)
- field (class: scala.Tuple2, name: _2, type: class java.lang.Object)
- object (class scala.Tuple2, (7532,java.util.ArrayList$Itr@e6275ea))
- element of array (index: 0)
- array (class [Lscala.Tuple2;, size 3068)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1958)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:935)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:934)
at org.apache.spark.api.java.JavaRDDLike$class.collect(JavaRDDLike.scala:361)
at org.apache.spark.api.java.AbstractJavaRDDLike.collect(JavaRDDLike.scala:45)
at com.acs.clemson.ordering.graph.GraphBuilder.buildByTriples(GraphBuilder.java:122)
at com.acs.clemson.ordering.algo.AmgCoarsener.coarsen(AmgCoarsener.java:51)
at Manager.ML(Manager.java:73)
at Main.main(Main.java:16)
When I looked into this, it turned out that ArrayList.iterator() returns an Itr object, and its class does not implement Serializable.
Is there another way to return an Iterable in Spark?
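For context, the flatMap-style Java function interfaces changed their return type between the 1.x and 2.x lines. Paraphrasing the Spark Java API from memory (so treat the exact declarations as an assumption), the change looks roughly like this:

//Spark 1.6.x: flatMap-style functions return an Iterable
public interface PairFlatMapFunction<T, K, V> extends Serializable {
    Iterable<Tuple2<K, V>> call(T t) throws Exception;
}

//Spark 2.x: the same interface now returns an Iterator
public interface PairFlatMapFunction<T, K, V> extends Serializable {
    Iterator<Tuple2<K, V>> call(T t) throws Exception;
}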
UPDATE: Downgrading to Spark version 1.6.2 and returning a list (an Iterable) instead of an iterator fixed the problem. As of version 2.1.0 you can only return an Iterator from flatMapToPair, and I am still not sure why the code breaks on 2.1.0.
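That said, if I read the stack trace right, the iterators returned from flatMapToPair are consumed inside the task and never serialized; the failing object is the iterator returned from mapValues, which becomes part of the record value itself and must be serialized when collect() ships the results to the driver. Since mapValues (unlike flatMapToPair) may return any value type, one workaround sketch that should stay on 2.1.0 is to return the serializable list itself and call iterator() only on the driver. This is untested, and the Graph constructor would have to accept lists (or have iterators created driver-side):

//Collect serializable Lists instead of ArrayList$Itr instances.
List<Tuple2<Integer, List<Tuple2<Integer, Double>>>> cEdgeLists =
        ctoC.union(ftoF).union(ftoC).union(ctoF)
            .groupByKey()
            .mapValues((a) -> {
                HashMap<Integer, Double> curEdges = new HashMap<>();
                for (Tuple2<Integer, Double> t : a) {
                    curEdges.merge(t._1(), t._2(), Double::sum);
                }
                List<Tuple2<Integer, Double>> edgesT = new ArrayList<>(curEdges.size());
                curEdges.forEach((k, v) -> edgesT.add(new Tuple2<>(k, v)));
                //ArrayList implements Serializable; its inner Itr class does not.
                return edgesT;
            })
            .collect();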