我在一个迭代算法中使用 GraphX API。虽然我已经仔细地对 RDD 进行了 cache/unpersist,并且注意控制了顶点的分区数,但每一轮的时间开销似乎仍然呈线性增长。下面是我的代码的简化版本,它同样存在这个问题:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
/**
 * Minimal reproduction of a per-round slowdown in an iterative GraphX job.
 *
 * Builds a generated graph, then for `maxIter` rounds adds to every vertex the
 * sum of the attributes of the vertices it points to, timing each round and
 * printing the per-round cost, the average, and the round-to-round differences.
 *
 * NOTE(review): the only action run on the new graph in each round is
 * `graph.vertices.take(10)`; nothing forces the edge side of the cached graph
 * before `preGraph.unpersist()`. Presumably that is why the round time keeps
 * growing -- confirm by running an action that also touches the edges.
 */
object ComputingTimeProblem extends App {
// Only log ERROR and above for the "org" and "akka" logger hierarchies.
Logger.getLogger("org").setLevel(Level.ERROR)
Logger.getLogger("akka").setLevel(Level.ERROR)
// Local master with a single worker thread.
val conf = new SparkConf().setMaster("local[1]").setAppName("test")
val sc = new SparkContext(conf)
// Not referenced anywhere else in this object.
val sqlContext = new SQLContext(sc)
// Generated log-normal graph with 15000 vertices; every vertex attribute is
// reset to 1.0 and the graph is marked for caching.
var graph = GraphGenerators
.logNormalGraph(sc, 15000).mapVertices((_, _) => 1d)
.cache
// First action on the graph: prints up to ten vertices.
graph.vertices.take(10).foreach(println)
val maxIter = 50
// Holds the previous round's graph so it can be unpersisted once the new one exists.
var preGraph: Graph[Double, Int] = null
// Per-round wall-clock durations, in minutes.
var allTime: ArrayBuffer[Double] = ArrayBuffer()
for (i <- 1 to maxIter) {
val begin = System.currentTimeMillis()
preGraph = graph
// For each source vertex, sum the attributes of its destination vertices.
val vertices2 = graph.triplets.map(tri => (tri.srcId, tri.dstAttr)).reduceByKey(_ + _)
// Add that sum to the vertex's own attribute and mark the new graph for caching.
graph = graph.joinVertices(vertices2)((vid, left, right) => left + right).cache
// Action on the vertex RDD only; the result is discarded.
graph.vertices.take(10)
preGraph.unpersist()
val end = System.currentTimeMillis()
// Milliseconds -> minutes.
val duration = (end - begin) / (60 * 1000d)
allTime += duration
println(s"Round ${i} Time Cost: %.4f min, Vertices Partition Num: %d".format(
duration, graph.vertices.getNumPartitions))
}
graph.vertices.take(10).foreach(println)
val avgTime = allTime.sum / allTime.size
println(s"Average Time = ${avgTime}")
// Difference in duration between each pair of consecutive rounds.
val timeCostDiffs = for (i <- 1 until maxIter) yield (allTime(i) - allTime(i - 1))
timeCostDiffs
.zipWithIndex
.map(x => "Round %d to %d, Time Cost Diff: %.4f min".format(x._2+1, x._2 + 2, x._1))
.foreach(println)
// Dump all per-round durations, one per line, under a "tc" header.
println("tc\n"+allTime.mkString("\n"))
}
我没有改变图对象的索引,而且 GraphX 会通过 leftZipJoin 方法连接(join)顶点,这不需要 shuffle,那么为什么每一轮的时间开销仍然在增加呢?有人能给出一些建设性的建议吗?谢谢!
答案 0 :(得分:1)
我刚刚发现,这仍然是一个血统(lineage)问题。Graph 对象包含两个 RDD:顶点 RDD 和边 RDD。在上面的代码中,我只物化(materialize)了顶点 RDD,而没有物化边 RDD。因此,每一轮都会把之前所有轮次的边重新计算一遍。所以,只要在每一轮中同时物化顶点 RDD 和边 RDD(例如对 triplets 执行一个 action),就可以解决这个问题,如下所示:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
/**
 * Iterative GraphX timing experiment with the lineage problem fixed.
 *
 * Builds a generated graph, then for `maxIter` rounds adds to every vertex the
 * sum of the attributes of the vertices it points to. Each round is timed; the
 * per-round cost, the average, and the round-to-round differences are printed.
 *
 * The key point is that every round fully materializes BOTH the vertex RDD and
 * the edge RDD of the newly cached graph before the previous graph is
 * unpersisted. Otherwise the un-materialized side has to be recomputed through
 * an ever-growing lineage and each round gets slower.
 */
object ComputingTimeProblem extends App {
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)

  val conf = new SparkConf().setMaster("local[1]").setAppName("test")
  val sc = new SparkContext(conf)
  // Not used by this example; kept so the object's members stay unchanged.
  val sqlContext = new SQLContext(sc)

  // Generated log-normal graph with 15000 vertices, every vertex attribute set to 1.0.
  var graph = GraphGenerators
    .logNormalGraph(sc, 15000).mapVertices((_, _) => 1d)
    // .partitionBy(PartitionStrategy.RandomVertexCut,8)
    .cache()
  graph.vertices.take(10).foreach(println)

  val maxIter = 50
  // Previous round's graph, released once the new graph has been materialized.
  var preGraph: Graph[Double, Int] = null
  // Per-round wall-clock durations, in minutes.
  var allTime: ArrayBuffer[Double] = ArrayBuffer()

  for (i <- 1 to maxIter) {
    val begin = System.currentTimeMillis()
    preGraph = graph

    // For each source vertex, sum the attributes of its destination vertices.
    val vertices2 = graph.triplets.map(tri => (tri.srcId, tri.dstAttr)).reduceByKey(_ + _)
    graph = graph.joinVertices(vertices2)((vid, left, right) => left + right).cache()

    // Materialize both the vertex and the edge RDD of the new graph.
    // `count()` evaluates EVERY partition; `take(10)` would stop after the first
    // partition(s) that yield 10 triplets, leaving the remaining edge partitions
    // uncached whenever the graph has more than one edge partition.
    graph.triplets.count()

    // Safe to release the old graph only now that the new one is fully cached.
    preGraph.unpersist()

    val end = System.currentTimeMillis()
    // Milliseconds -> minutes.
    val duration = (end - begin) / (60 * 1000d)
    allTime += duration
    println(s"Round ${i} Time Cost: %.4f min, Vertices Partition Num: %d".format(
      duration, graph.vertices.getNumPartitions))
  }

  graph.vertices.take(10).foreach(println)

  val avgTime = allTime.sum / allTime.size
  println(s"Average Time = ${avgTime}")

  // Difference in duration between each pair of consecutive rounds.
  val timeCostDiffs = for (i <- 1 until maxIter) yield (allTime(i) - allTime(i - 1))
  timeCostDiffs
    .zipWithIndex
    .map(x => "Round %d to %d, Time Cost Diff: %.4f min".format(x._2 + 1, x._2 + 2, x._1))
    .foreach(println)
  println("tc\n" + allTime.mkString("\n"))

  // Release Spark resources explicitly instead of relying on the JVM shutdown hook.
  sc.stop()
}