如何在Spark中优化代码执行时间?

时间:2019-08-08 08:32:26

标签: scala apache-spark

我在 Scala 中有一个项目,用于模拟我国的经济。我编写了一个消息传递系统:agent 可以向环境(environment)发送消息并接收回复。我使用 Spark、基于 agentId 对消息的发送进行并行化。我还编写了一个简单的测试来测量代码执行时间。对于 50 个 agent,运行代码大约需要 30 秒。我需要减少代码执行时间。在这种情况下我该怎么办?哪种方法可以帮助我?

我使用 combineByKey,根据每个 agent 的 key(即 id)来合并发给它的消息。我也考虑过使用 groupByKey、reduceByKey 等。但除此之外,我还能采取什么措施来减少代码执行时间呢?下面是消息传递部分的代码。

/** Advances the simulation one time step at a time until `timer` exceeds
  * `until`. Each step: (1) every agent handles its inbox and runs one step,
  * (2) outgoing messages are grouped per receiver, (3) the environment
  * answers its messages immediately, (4) messages are joined back onto the
  * agents for the next step. Runs either on Spark RDDs (GLOBAL.RUN_SPARK)
  * or on plain local collections.
  *
  * NOTE(review): the original code wrapped the loop in `vaxt { ... }`,
  * which is not valid Scala — presumably a paste artifact; removed here.
  */
@transient def run_until(until: Int) {
  println("RESUME Simulation " + this);

  // Builds per-receiver message lists. Extracted because the exact same
  // combineByKey combiner triple appeared twice in the loop body.
  // combineByKey builds each List incrementally with map-side combining,
  // which shuffles less data than groupByKey on individual messages.
  def groupByReceiver(msgs: RDD[Message]): RDD[(AgentId, List[Message])] =
    msgs.map(x => (x.receiverId, x)).combineByKey(
      (message: Message) => List(message),
      (l: List[Message], message: Message) => message :: l,
      (l1: List[Message], l2: List[Message]) => l1 ::: l2
    )

  var messages: List[Message] = List()

  // Main simulation loop: one iteration per simulated time step.
  while (timer <= until) {
    if (!GLOBAL.silent) println("timer = " + timer);

    if (GLOBAL.RUN_SPARK) {
      // Advance every agent one step; agents accumulate outgoing messages.
      simsSpark = simsSpark.mapValues { s =>
        s.handleMessages()
        s.run_until(timer)._1.asInstanceOf[SimO]
      }.cache()

      // NOTE(review): every iteration cache()s new RDDs without ever
      // unpersisting the superseded ones, and the RDD lineage grows with
      // each step — this is a likely cause of the slowdown. Consider
      // unpersist() on replaced RDDs and periodic checkpoint() to keep
      // executor memory and task/closure size bounded.
      var dMessages: RDD[(AgentId, List[Message])] =
        groupByReceiver(simsSpark.flatMap(_._2.getMessages)).cache()

      // Environment answers immediately for the next step.
      val envMessages: RDD[(AgentId, List[Message])] =
        groupByReceiver(
          dMessages.filter(_._1 == ENVIRONMENT_ID).flatMap(_._2).flatMap(handleEnvMessage)
        ).cache()

      // Merge environment replies into the agent messages. The groupByKey
      // is required here: both RDDs may contain the same receiver key, and
      // a plain union would yield duplicate keys (and duplicated rows) in
      // the join below.
      dMessages = dMessages.filter(_._1 != ENVIRONMENT_ID).union(envMessages)
        .groupByKey().mapValues(_.flatten.toList)
      dMessages = dMessages.cache()

      // Deliver grouped messages to each agent; agents with no mail get an
      // empty inbox via the left outer join.
      simsSpark = simsSpark.leftOuterJoin(dMessages).mapValues { x =>
        x._1.setReceiveMessages(x._2.getOrElse(List()))
        x._1
      }.persist()

      if (!GLOBAL.silent) {
        simsSpark.foreach(_._2.stat)
        println()
        println()
      }

    } else {
      // Local (non-Spark) path: same message protocol on plain collections.
      // Environment messages are expanded into replies before grouping.
      messages = messages.filter(_.receiverId == ENVIRONMENT_ID).flatMap(handleEnvMessage) ::: messages.filter(_.receiverId != ENVIRONMENT_ID)

      val mx = messages.groupBy(_.receiverId)
      messages = List()

      market = market.map(m => {
        m._2.setReceiveMessages(mx.getOrElse(m._2.id, List()))
        m._2.handleMessages()
        m
      })

      messages = market.flatMap(_._2.getMessages).toList ::: messages

      sims = sims.map { s =>
        s.setReceiveMessages(mx.getOrElse(s.id, List()))
        s.handleMessages()
        s.run_until(timer)._1.asInstanceOf[SimO]
      }
      messages = sims.flatMap(_.getMessages) ::: messages

      if (!GLOBAL.silent) {
        for (s <- sims) s.stat;
        println();
        println();
      }
    }

    timer += 1;
  }

  println("STOP Simulation " + this);
}

0 个答案:

没有答案