How to implement parallel processing in Scala using Spark

Date: 2017-07-12 03:25:12

Tags: multithreading scala apache-spark parallel-processing

I have a piece of code that runs one job at a time, importing data from Oracle and loading it into HDFS. At the moment only a single job can run at once. Is there a way to improve performance by running two jobs at the same time, and also to configure the number of jobs to run based on the available resources?

  def runETL(aInput: Any): Unit = {

    // Copy a trigger file to HDFS to launch the Job (90 tables) and Incremental (10 tables) load Job
    copyBytesToFile("Oracle DB refreshed", ConfigFactory.load.getString("oracle.status.file.path"))
    var start = System.currentTimeMillis

    try {
      val maxlastActionDteTime: String = EventDAO.getMaxLastActionDate

      start = System.currentTimeMillis
      EventDAO.loadStagingTables(maxlastActionDteTime, ConfigFactory.load.getInt("facets.oracledb"),
        ConfigFactory.load.getInt("facets.oracledb"), ConfigFactory.load.getInt("facets.oracledb"))
    } catch {
      case e: Exception =>
        logger.error(getStackTraceAsString(e))
        MailUtil.sendMail(s"ETLScheduler Error -> ${e.getMessage}")
        throw new RuntimeException("runETL was terminated due to an Exception.")
    }
    val sparkAppHandle = new SparkLauncher()
      .setSparkHome(ConfigFactory.load.getString("spark.home.location"))
      .setAppResource(ConfigFactory.load.getString("spark.resource.jar.location"))
      .setMainClass("com.s.PIDriver")
      .setMaster("yarn-cluster")
      .setConf("spark.executor.memory", ConfigFactory.load.getString("spark.conf.executor.memory."))
      .setConf("spark.executor.instances", ConfigFactory.load.getString("spark.conf.executor.instances."))
      .setConf("spark.executor.cores", ConfigFactory.load.getString("spark.conf.executor.cores."))
      .setConf("spark.yarn.queue", ConfigFactory.load.getString("spark.conf.queue."))
      .setConf("spark.driver.memory", ConfigFactory.load.getString("spark.conf.driver.memory."))
      .startApplication()

    sparkAppHandle.addListener(new SparkAppHandle.Listener() {
      // Called whenever the application changes state (CONNECTED, SUBMITTED, RUNNING, FINISHED, FAILED or KILLED)
      override def stateChanged(handle: SparkAppHandle): Unit = {
        val appState = handle.getState

        if (appState.isFinal) {
          // Copy a trigger file to HDFS to launch the Table load Job only if the App finished and did not fail or was not killed.
          if (appState == SparkAppHandle.State.FINISHED) {
            // Compute Stats and Invalidate Metadata
            start = System.currentTimeMillis
            EventDAO.updateTableMetadata
            logger.info("Hive and Impala metadata refreshed in " + (System.currentTimeMillis - start) / 1000 + " seconds..so copying the _refresh.done file")
            copyBytesToFile(" ETL Spark Job and Metadata updates completed", ConfigFactory.load.getString(".status.file.path"))
            logger.info("**************  ETL Spark Job and Metadata updates were completed successfully... **************")
          } else if (appState == SparkAppHandle.State.KILLED || appState == SparkAppHandle.State.FAILED || appState == SparkAppHandle.State.UNKNOWN) {
            MailUtil.sendMail(" Job did not complete, Application finished with status, " + appState)
            throw new RuntimeException(" Job did not complete, Application finished with status, " + appState)
          }
        }
      }

      override def infoChanged(handle: SparkAppHandle): Unit = {
        // No-op: only state transitions are of interest here.
      }
    })
  }

0 Answers:

No answers