I wrote an aggregate function that returns a range-encoded representation of a Long column. I run it on a 1 GB Parquet file with 50 columns. My cluster has 55 executors with 4 cores per node. Even after caching the DataFrame, the query takes about 5 minutes. Is there a way to run it more efficiently?
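The encoding packs each run of consecutive values into a (start, length - 1) pair, so a column containing 5, 6, 7, 10 encodes to [5, 2, 10, 0].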
Here is the UDAF -
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
import scala.collection.mutable.{ArrayBuffer, WrappedArray}

class Concat extends UserDefinedAggregateFunction {
  // Input: a single Long column.
  def inputSchema: StructType =
    StructType(StructField("value", LongType) :: Nil)

  // Buffer: a flat array of (start, length - 1) pairs, one pair per run of consecutive values.
  def bufferSchema: StructType = StructType(
    StructField("concatenation", ArrayType(LongType, false)) :: Nil
  )

  def dataType: DataType = ArrayType(LongType, false)

  def deterministic: Boolean = true

  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer.update(0, new ArrayBuffer[Long]())
  }

  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    // Spark hands the stored array back as a WrappedArray; copy it into a mutable buffer.
    val l = buffer.getAs[Seq[Long]](0).toBuffer
    val v = input.getAs[Long](0)
    val n = l.size
    if (n < 2) {
      // First value: open a new run (v, 0).
      l += v
      l += 0L
    } else {
      val x1 = l(n - 2) // start of the current run
      val x2 = l(n - 1) // length of the current run, minus one
      if (x1 - 1 == v) {
        // v extends the current run downward: move the start, grow the length.
        l(n - 2) = v
        l(n - 1) = x2 + 1
      } else if (x1 + x2 + 1 == v) {
        // v extends the current run upward: just grow the length.
        l(n - 1) = x2 + 1
      } else {
        // v is not adjacent to the current run: open a new run (v, 0).
        l += v
        l += 0L
      }
    }
    buffer.update(0, l)
  }

  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    // Concatenate the two partial encodings; runs that meet at the boundary are not coalesced.
    val a = buffer1.getAs[WrappedArray[Long]](0)
    val b = buffer2.getAs[WrappedArray[Long]](0)
    buffer1.update(0, a ++ b)
  }

  def evaluate(buffer: Row): Any = {
    buffer.getSeq(0)
  }
}
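As a sanity check, the UDAF can be exercised on a tiny local DataFrame (a sketch; the sample values and table name are illustrative, and since Spark does not guarantee the order in which rows reach update, the exact pairs can vary):

import sqlContext.implicits._

sqlContext.udf.register("lcon", new Concat)

// 5, 6, 7 form one run and 10 another, so the expected result is [5, 2, 10, 0]
val sample = Seq(5L, 6L, 7L, 10L).map(Tuple1(_)).toDF("Id")
sample.registerTempTable("sample")
sqlContext.sql("SELECT lcon(Id) FROM sample").show()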
Here is how I run the query -
val concat = new Concat
sqlContext.udf.register("lcon", concat)
val df = sqlContext.read.parquet("file_url")
df.cache
df.registerTempTable("agg11")
val results = sqlContext.sql("SELECT lcon(Id) FROM agg11 WHERE Status IN (1) AND Device IN (1,4) AND Medium IN (1)").collect
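One caveat with the timing: cache is lazy, so the collect above is the first action on df and therefore also pays the cost of scanning the Parquet file into memory. To time the aggregation against truly cached data, the cache would have to be materialized first, e.g.:

df.cache
df.count   // an action, so the data is actually pulled into the cache here
// the aggregation query now reads from memory instead of re-scanning the file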