Spark opens multiple threads for a single job when trying to run parallel jobs

Date: 2018-05-18 08:54:34

Tags: apache-spark apache-spark-sql akka-http

We have a use case where we need to run parallel Spark SQL queries on a single Spark session through a REST API (Akka HTTP).

Application Conf

my-blocking-dispatcher {
  type = Dispatcher
  executor = "thread-pool-executor"
  thread-pool-executor {
    // or in Akka 2.4.2+
    fixed-pool-size = 4
  }
  throughput = 100
} 

Spark Service

import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerJobStart}
import org.apache.spark.sql.execution.ui.CustomSqlListener
import org.apache.spark.sql.{Row, SparkSession}
import scala.collection.mutable.ListBuffer
import scala.concurrent.{ExecutionContext, Future}
import scala.util.parsing.json.JSON

trait SparkService {

  val session = SparkSession
    .builder()
    .config("spark.scheduler.mode", "FAIR")
    .appName("QueryCancellation")
    .master("local[*]")
    .enableHiveSupport()
    .getOrCreate()

  var queryJobMapStart = Map[String, String]()
  var queryStatusMap = Map[String,String]()
  session.sparkContext.setLogLevel("ERROR")
  session.sparkContext.setCallSite("Reading the file")
  val dataDF = session.read
      .format("csv")
      .option("inferSchema","true")
      .option("header","true")
      .load("C:\\dev\\QueryCancellation\\src\\main\\resources\\Baby_Names__Beginning_2007.csv")



  dataDF.createOrReplaceTempView("data_tbl")


  //dataDF.printSchema()

  val customListener = new CustomSqlListener(session.sparkContext.getConf,queryJobMapStart,queryStatusMap)
  val appListener = session.sparkContext.addSparkListener(customListener)


  def runQuery(query: String, queryId: String)(implicit ec: ExecutionContext) = {

    // println("queryId: " + queryId + " query: " + query)
    session.sparkContext.setLocalProperty("callSite.short", queryId)
    session.sparkContext.setLocalProperty("callSite.long", query)

    session.sql(query).show(2)
    // Thread.sleep(60000)
    // Future(data)
  }

}

object SparkService extends SparkService 
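For context, a minimal sketch (not part of the question) of how runQuery could be wrapped in Futures so that two queries run concurrently on the blocking dispatcher; the demo object and the use of ExecutionContext.global in place of the Akka dispatcher are assumptions:

import scala.concurrent.{ExecutionContext, Future}

object ParallelQueryDemo extends App {
  import SparkService._

  // In the real service this would be system.dispatchers.lookup("my-blocking-dispatcher").
  implicit val ec: ExecutionContext = ExecutionContext.global

  // Each Future runs on its own pool thread; with spark.scheduler.mode=FAIR
  // the resulting Spark jobs can be scheduled side by side.
  Future(runQuery("select count(*) from data_tbl", "q-1"))
  Future(runQuery("select sex, count(*) from data_tbl group by sex", "q-2"))

  Thread.sleep(30000) // crude wait so the JVM does not exit before the jobs finish
}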

Query Service

import java.util.UUID
import java.util.concurrent.ConcurrentHashMap
import akka.actor.ActorSystem
import akka.http.scaladsl.server.Route
import akka.http.scaladsl.server.Directives._
import akka.stream.ActorMaterializer
trait QueryService extends SparkService {
  implicit val system: ActorSystem
  implicit val materializer : ActorMaterializer
  // implicit val sparkSession: SparkSession
  // val datasetMap = new ConcurrentHashMap[String, Dataset[Row]]()
  implicit val blockingDispatcher = system.dispatchers.lookup("my-blocking-dispatcher")
  val route: Route =
    pathSingleSlash {
      get {
        complete {
          "welcome to rest service"
        }
      }
    } ~
      path("runQuery" / "county"/Segment) { county  =>
        get {
          complete{
            var res= ""
            val documentId = "user ::" + UUID.randomUUID().toString
            val queryId = System.nanoTime().toString
            val stmt = "select a.sex,count(*) from data_tbl a,data_tbl b where b.county=a.county and a.country= '"+county+"' group by a.sex"
            val result = runQuery(stmt,queryId)
            /* var entity = queryResult match {
               case Some(result) =>s"Query : $stmt  is submitted. Query id is $result. User id is $documentId"
               case None => s"Query : $stmt could not be submitted. User id is $documentId"
             }*/
            /*result.onComplete{
              case Success(value) => println(s"Query completed")
              case Failure(e) =>  None
            }*/
            var entity = s"Query : $stmt  is submitted. Query id is $queryId. User id is $documentId"
            entity
          }
        }
      } ~
      path("getStatus" / """[\w[0-9]-_]+""".r) { id =>
        get {
          complete {
            val statusResult = getStatus(id)
            var res = statusResult match {
              case Some(result) =>  s"Status for query id : $id is $result"
              case None =>  s"Could not find the status of the query id : $id"
            }
            res
          }
        }
      } ~
      path("killQuery" / """[\w[0-9]-_]+""".r) { id =>
        get {
          complete {
            val statusResult = killQuery(id)
            s"Query id $id is cancelled."
          }
        }
      }
}
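The getStatus and killQuery helpers referenced by the route are not shown in the question; a hypothetical sketch of what they might look like, assuming runQuery tags its jobs with setJobGroup(queryId, query) so Spark can cancel them as a group:

trait QueryControl extends SparkService {

  // Look the query id up in the status map maintained by the custom listener.
  def getStatus(queryId: String): Option[String] =
    queryStatusMap.get(queryId)

  // Cancel every job in the group named after the query id (assumes the jobs
  // were submitted under setJobGroup(queryId, query)).
  def killQuery(queryId: String): Option[String] = {
    session.sparkContext.cancelJobGroup(queryId)
    Some(queryId)
  }
}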

Query Server

import akka.actor.ActorSystem
import akka.http.scaladsl.Http
import akka.stream.ActorMaterializer

import scala.concurrent.Future
//import scala.concurrent.ExecutionContext.Implicits.global

class QueryServer (implicit val system:ActorSystem ,
                   implicit val materializer: ActorMaterializer) extends QueryService {

  def startServer(address : String, port: Int) = {
      Http().bindAndHandle(route,address,port)
  }

}

object QueryServer extends App {

  implicit val actorSystem = ActorSystem("query-server")
  implicit val materializer = ActorMaterializer()
  val server = new QueryServer()
  server.startServer("localhost", 8080)
  println("running server at localhost 8080")

}

When I try to run a query against Spark SQL via http://localhost:8080/runQuery/county/'KINGS', multiple job IDs are created, and the job ID counter skips ahead to the maximum value.

Below is a screenshot of the Spark UI. I cannot understand why the highlighted tasks are being created.

[screenshot: Spark UI jobs page]

Below is the console log, which shows the query executing only once:

"running server at localhost 8080
173859599588358->2
****************************************************************************************
****************************************************************************************
Job id 2 is completed
--------------------------------------------------------
173859599588358->3
****************************************************************************************
****************************************************************************************
Job id 3 is completed
--------------------------------------------------------
173859599588358->4
****************************************************************************************
****************************************************************************************
Job id 4 is completed
--------------------------------------------------------
173859599588358->5
****************************************************************************************
****************************************************************************************
Job id 5 is completed
--------------------------------------------------------
173859599588358->6
****************************************************************************************
****************************************************************************************
Job id 6 is completed
--------------------------------------------------------
173859599588358->7
****************************************************************************************
****************************************************************************************
Job id 7 is completed
--------------------------------------------------------
+---+--------+
|sex|count(1)|
+---+--------+
|  F|12476769|
|  M|12095080|
+---+--------+

Spark version: 2.2.1

2 Answers:

Answer 0 (score: 1):

Apart from the rest of the logic, the option("inferSchema","true") part will trigger a Spark job of its own. For details, see the Spark API.
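For reference, a sketch of how that inference job could be avoided by declaring the schema up front; the column names below are assumptions about the CSV, not taken from the question:

import org.apache.spark.sql.types._

// Assumed column layout for the baby-names CSV; adjust to the real file.
val babyNamesSchema = StructType(Seq(
  StructField("Year", IntegerType),
  StructField("First Name", StringType),
  StructField("County", StringType),
  StructField("Sex", StringType),
  StructField("Count", IntegerType)
))

val dataDF = session.read
  .format("csv")
  .option("header", "true")
  .schema(babyNamesSchema) // no inference pass, so no extra Spark job
  .load("C:\\dev\\QueryCancellation\\src\\main\\resources\\Baby_Names__Beginning_2007.csv")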

Since the jobs are short-lived, they may be sequential rather than parallel (judging by their start times). Have you looked at the SQL tab of the Spark UI? All of these jobs (except the schema-inference one) are probably part of a single executed SQL query.

Answer 1 (score: 1):

It looks like the Spark Catalyst optimizer is optimizing the query. Multiple DAGs are created, and presumably the best execution plan is selected. You can inspect the execution plans; there may be more than one. I don't think this has anything to do with Akka HTTP. Try running the code in the spark-shell to verify this claim.
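For example, the plans Catalyst produced can be inspected from the spark-shell (a sketch; spark is the shell's built-in session, and the query is a simplified form of the one in the question):

// Prints the parsed, analyzed and optimized logical plans plus the
// physical plan that was actually selected.
val stmt = "select a.sex, count(*) from data_tbl a, data_tbl b " +
  "where b.county = a.county group by a.sex"
spark.sql(stmt).explain(true) // .explain() prints only the physical plan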