I wrote an aggregate function that returns a range-encoded representation of a Long column. I run it on a 1 GB Parquet file with 50 columns. My cluster has 55 executors with 4 cores per node. Even after caching the DataFrame, the query takes about 5 minutes. Is there a way to run it more efficiently?
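The encoding packs each run of consecutive values into a (start, length - 1) pair, so a column containing 5, 6, 7, 10 encodes to [5, 2, 10, 0].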
Here is the UDAF -
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
import scala.collection.mutable.{ArrayBuffer, WrappedArray}

class Concat extends UserDefinedAggregateFunction {
  // Input: a single Long column.
  def inputSchema: StructType =
    StructType(StructField("value", LongType) :: Nil)

  // Buffer: a flat array of (start, length - 1) pairs, one pair per run of consecutive values.
  def bufferSchema: StructType = StructType(
    StructField("concatenation", ArrayType(LongType, false)) :: Nil
  )

  def dataType: DataType = ArrayType(LongType, false)

  def deterministic: Boolean = true

  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer.update(0, new ArrayBuffer[Long]())
  }

  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    // Spark hands the stored array back as a WrappedArray; copy it into a mutable buffer.
    val l = buffer.getAs[Seq[Long]](0).toBuffer
    val v = input.getAs[Long](0)
    val n = l.size
    if (n < 2) {
      // First value: open a new run (v, 0).
      l += v
      l += 0L
    } else {
      val x1 = l(n - 2) // start of the current run
      val x2 = l(n - 1) // length of the current run, minus one
      if (x1 - 1 == v) {
        // v extends the current run downward: move the start, grow the length.
        l(n - 2) = v
        l(n - 1) = x2 + 1
      } else if (x1 + x2 + 1 == v) {
        // v extends the current run upward: just grow the length.
        l(n - 1) = x2 + 1
      } else {
        // v is not adjacent to the current run: open a new run (v, 0).
        l += v
        l += 0L
      }
    }
    buffer.update(0, l)
  }

  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    // Concatenate the two partial encodings; runs that meet at the boundary are not coalesced.
    val a = buffer1.getAs[WrappedArray[Long]](0)
    val b = buffer2.getAs[WrappedArray[Long]](0)
    buffer1.update(0, a ++ b)
  }

  def evaluate(buffer: Row): Any = {
    buffer.getSeq(0)
  }
}
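As a sanity check, the UDAF can be exercised on a tiny local DataFrame (a sketch; the sample values and table name are illustrative, and since Spark does not guarantee the order in which rows reach update, the exact pairs can vary):

import sqlContext.implicits._

sqlContext.udf.register("lcon", new Concat)

// 5, 6, 7 form one run and 10 another, so the expected result is [5, 2, 10, 0]
val sample = Seq(5L, 6L, 7L, 10L).map(Tuple1(_)).toDF("Id")
sample.registerTempTable("sample")
sqlContext.sql("SELECT lcon(Id) FROM sample").show()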
Here is how I run the query -
val concat = new Concat
sqlContext.udf.register("lcon", concat)
val df = sqlContext.read.parquet("file_url")
df.cache
df.registerTempTable("agg11")
val results = sqlContext.sql("SELECT lcon(Id) FROM agg11 WHERE Status IN (1) AND Device IN (1,4) AND Medium IN (1)").collect
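One caveat with the timing: cache is lazy, so the collect above is the first action on df and therefore also pays the cost of scanning the Parquet file into memory. To time the aggregation against truly cached data, the cache would have to be materialized first, e.g.:

df.cache
df.count   // an action, so the data is actually pulled into the cache here
// the aggregation query now reads from memory instead of re-scanning the file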