我正在使用UDAF在数据帧上使用自定义reduceByKey, 基于下面的链接,目标是按键获取(累加器,计数)。
数据是数据帧键值:
+----+-------+-------+
|key | accu | count |
+----+-------+-------+
| 1 | 750.0 | 2 |
| 2 | 400.0 | 2 |
| 3 | 350.0 | 1 |
+----+-------+-------+
从这里拿一些代码: Spark Dataset aggregation similar to RDD aggregate(zero)(accum, combiner) https://ragrawal.wordpress.com/2015/11/03/spark-custom-udaf-example/
下一个代码是实现,使用两个映射:
累加器的MapType(IntegerType->(DoubleType))
计数器的MapType(IntegerType->(LongType))
现在我想只使用一个映射或任何可存储两个数字的结构来存储两个值:
1)MapType(IntegerType-> Tuple2(DoubleType,LongType))但Tuple2不是sql类型
2)带有:case类acuCount的映射(acu:Double,count:Long)但是acuCount不是sql类型
3)ArrayType(DoubleType)
4)或任何可存储两个数字的结构
然后想要返回一个映射,或者如果可能返回另一个数据帧:
class GroupByAccCount extends org.apache.spark.sql.expressions.UserDefinedAggregateFunction {
  import org.apache.spark.sql.Row
  import org.apache.spark.sql.expressions.MutableAggregationBuffer
  import org.apache.spark.sql.types._

  /** Input row shape: (k: Int, v: Double) — the key and the value to accumulate. */
  def inputSchema: StructType = new StructType().add("k", IntegerType).add("v", DoubleType)

  /**
   * Aggregation buffer: two parallel maps keyed by group id —
   * slot 0 accumulates the sum per key, slot 1 counts rows per key.
   *
   * BUG FIX: the original declared BOTH fields with the same name "values";
   * duplicate field names in a StructType are ambiguous and break lookups by
   * name, so each field now has a distinct name.
   */
  def bufferSchema: StructType = StructType(
    StructField("accum", MapType(IntegerType, DoubleType)) ::
    StructField("count", MapType(IntegerType, LongType)) :: Nil)

  // Same input always yields the same output.
  def deterministic: Boolean = true

  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = Map.empty[Int, Double] // key -> accumulated sum
    buffer(1) = Map.empty[Int, Long]   // key -> row count
  }

  /** Sequence op: fold one input row into the partial aggregate. */
  def update(buffer: MutableAggregationBuffer, row: Row): Unit = {
    val key = row.getAs[Int](0)
    val value = row.getAs[Double](1)
    val accum = buffer.getAs[Map[Int, Double]](0)
    buffer(0) = accum + (key -> (accum.getOrElse(key, 0.0) + value))
    val counts = buffer.getAs[Map[Int, Long]](1)
    buffer(1) = counts + (key -> (counts.getOrElse(key, 0L) + 1L))
  }

  /** Combine op: merge two partial aggregates (buffer2 into buffer1). */
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val accum1 = buffer1.getAs[Map[Int, Double]](0)
    val accum2 = buffer2.getAs[Map[Int, Double]](0)
    buffer1(0) = accum2.foldLeft(accum1) { case (acc, (k, v)) =>
      acc + (k -> (acc.getOrElse(k, 0.0) + v))
    }
    val counts1 = buffer1.getAs[Map[Int, Long]](1)
    val counts2 = buffer2.getAs[Map[Int, Long]](1)
    buffer1(1) = counts2.foldLeft(counts1) { case (acc, (k, c)) =>
      acc + (k -> (acc.getOrElse(k, 0L) + c))
    }
  }

  // Returned type: key -> accumulated sum only. The count map lives in the
  // buffer but is not returned; to return both values per key, switch this to
  // MapType(IntegerType, ArrayType(DoubleType)) or a StructType("acc","count")
  // value and adjust evaluate accordingly.
  def dataType: DataType = MapType(IntegerType, DoubleType)

  /** Final result extracted from the buffer: the accumulator map (slot 0). */
  def evaluate(buffer: Row): Any = buffer.getAs[Map[Int, Double]](0)
}
下一个是包含两个映射的代码,但不完整,因为只返回一个:
{{1}}