使用 Pregel API 处理大量分层数据时性能非常差

时间:2019-06-18 12:23:55

标签: apache-spark bigdata spark-graphx

我使用 Spark 2.4 的 GraphX 尝试解决一个层级结构(分层数据)问题。该实现能够得到预期的结果,但在处理完整数据集时耗时非常长。我尝试了 4 种不同的分区策略,但性能都没有提升。该实现使用了 Pregel API。

//创建顶点RDD。主键,根,路径

val verticesRDD = vertexDF.rdd
  .map { x => (x.get(0), x.get(1), x.get(2)) }
  .map { x => (MurmurHash3.stringHash(x._1.toString).toLong, (x._1.asInstanceOf[Any], x._2.asInstanceOf[Any], x._3.asInstanceOf[String])) }

//创建边缘RDD。自上而下的关系

val edgesRDD = edgeDF.rdd.map { x => (x.get(0), x.get(1)) }
      .map { x => Edge(MurmurHash3.stringHash(x._1.toString).toLong, MurmurHash3.stringHash(x._2.toString).toLong, "topdown") }  
graph = Graph(verticesRDD, edgesRDD).partitionBy(EdgePartition2D).cache()
val pathSeperator = """/"""
val initialMsg = (0L, 0, 0.asInstanceOf[Any], List("dummy"), 0, 1)
val initialGraph = graph.mapVertices((id, v) => (id, 0, v._2, List(v._3), 0, v._3, 1, v._1))
val hrchyRDD = initialGraph.pregel(initialMsg,Int.MaxValue,     EdgeDirection.Out)(setMsg,sendMsg,mergeMsg)   

def setMsg(vertexId: VertexId, value: (Long, Int, Any, List[String], Int, String, Int, Any), message: (Long, Int, Any, List[String], Int, Int)): (Long, Int, Any, List[String], Int, String, Int, Any) = {
    if (message._2 < 1) { //superstep 0 - initialize
      (value._1, value._2 + 1, value._3, value._4, value._5, value._6, value._7, value._8)
    } else if (message._5 == 1) { // set isCyclic
      (value._1, value._2, value._3, value._4, message._5, value._6, value._7, value._8)
    } else if (message._6 == 0) { // set isleaf
      (value._1, value._2, value._3, value._4, value._5, value._6, message._6, value._8)
    } else { // set new values
      (message._1, value._2 + 1, message._3, value._6 :: message._4, value._5, value._6, value._7, value._8)
    }
  }

  // send the value to vertices
  def sendMsg(triplet: EdgeTriplet[(Long, Int, Any, List[String], Int, String, Int, Any), _]): Iterator[(VertexId, (Long, Int, Any, List[String], Int, Int))] = {
    val sourceVertex = triplet.srcAttr
    val destinationVertex = triplet.dstAttr
    // check for icyclic
    if (sourceVertex._1 == triplet.dstId || sourceVertex._1 == destinationVertex._1)
      if (destinationVertex._5 == 0) { //set iscyclic
        Iterator((triplet.dstId, (sourceVertex._1, sourceVertex._2, sourceVertex._3, sourceVertex._4, 1, sourceVertex._7)))
      } else {
        Iterator.empty
      }
    else {
      if (sourceVertex._7 == 1) //is NOT leaf
      {
        Iterator((triplet.srcId, (sourceVertex._1, sourceVertex._2, sourceVertex._3, sourceVertex._4, 0, 0)))
      } else { // set new values
        Iterator((triplet.dstId, (sourceVertex._1, sourceVertex._2, sourceVertex._3, sourceVertex._4, 0, 1)))
      }
    }
  }

  def mergeMsg(msg1: (Long, Int, Any, List[String], Int, Int), msg2: (Long, Int, Any, List[String], Int, Int)): (Long, Int, Any, List[String], Int, Int) = {
    msg2
  }

需要帮助来解决性能问题。

0 个答案:

没有答案