我使用Spark 2.0.1和Scala 2.11。
这是一个与Spark中用户自定义聚合函数(UDAF)相关的问题。我借用here提供的示例答案来说明我的问题:
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{Row, Column}
/**
 * Placeholder aggregate whose result is a struct of two arrays.
 *
 * The buffer callbacks are all no-ops; `evaluate` always yields the same
 * hard-coded pair of arrays.
 */
object DummyUDAF extends UserDefinedAggregateFunction {

  /** One string input column named "x". */
  override def inputSchema: StructType =
    StructType(Seq(StructField("x", StringType)))

  /** Two buffer slots: an array of longs ("buff") and an array of doubles ("buff2"). */
  override def bufferSchema: StructType =
    StructType(Seq(
      StructField("buff", ArrayType(LongType)),
      StructField("buff2", ArrayType(DoubleType))
    ))

  /** Result struct with fields "xs" (array of longs) and "ys" (array of doubles). */
  override def dataType: StructType =
    StructType(Seq(
      StructField("xs", ArrayType(LongType)),
      StructField("ys", ArrayType(DoubleType))
    ))

  override def deterministic: Boolean = true

  /** Intentionally empty: the buffer is never written. */
  override def initialize(buffer: MutableAggregationBuffer): Unit = ()

  /** Intentionally empty: input rows are ignored. */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = ()

  /** Intentionally empty: there is nothing to combine. */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = ()

  /** Returns a fixed (longs, doubles) tuple regardless of the buffer contents. */
  override def evaluate(buffer: Row): (Array[Long], Array[Double]) =
    (Array[Long](1L, 2L, 3L), Array[Double](1.0, 2.0, 3.0))
}
我可以轻松地返回多个Map而不是Array,但无法在update方法中就地修改(mutate)这些Map。
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{Row, Column}
import scala.collection.mutable.Map
/**
 * UDAF attempt that stores two maps in the aggregation buffer and tries to
 * mutate them in place.
 *
 * Within this snippet `Map` is `scala.collection.mutable.Map` (see the import
 * directly above), so every `asInstanceOf[Map[Double,Int]]` below is a cast to
 * the mutable type.
 *
 * NOTE(review): the stack trace reported for this code is
 * `immutable.Map$EmptyMap$ cannot be cast to scala.collection.mutable.Map`,
 * which suggests the value read back from the buffer is not the mutable
 * instance stored in `initialize` -- confirm against Spark's buffer handling.
 */
object DummyUDAF extends UserDefinedAggregateFunction {
// Input columns: "x" (double) and "y" (integer).
def inputSchema = new StructType().add("x", DoubleType).add("y", IntegerType)
// Two buffer slots, each declared as a map from double to integer.
def bufferSchema = new StructType()
.add("buff", MapType(DoubleType, IntegerType))
.add("buff2", MapType(DoubleType, IntegerType))
// Result struct with two map-typed fields, "xs" and "ys".
def dataType = new StructType()
.add("xs", MapType(DoubleType, IntegerType))
.add("ys", MapType(DoubleType, IntegerType))
def deterministic = true
// Puts an empty mutable map into each buffer slot.
def initialize(buffer: MutableAggregationBuffer) = {
buffer(0) = scala.collection.mutable.Map[Double,Int]()
buffer(1) = scala.collection.mutable.Map[Double,Int]()
}
// Casts each slot to a mutable map and writes an entry into it directly:
// slot 0 gets (x -> y), slot 1 gets (x*10 -> y*10). The slot itself is never reassigned.
def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
buffer(0).asInstanceOf[Map[Double,Int]](input.getDouble(0)) = input.getInt(1)
buffer(1).asInstanceOf[Map[Double,Int]](input.getDouble(0)*10) = input.getInt(1)*10
}
// Appends buffer2's entries into buffer1's maps in place, again relying on the mutable cast.
def merge(buffer1: MutableAggregationBuffer, buffer2: Row) = {
buffer1(0).asInstanceOf[Map[Double,Int]] ++= buffer2(0).asInstanceOf[Map[Double,Int]]
buffer1(1).asInstanceOf[Map[Double,Int]] ++= buffer2(1).asInstanceOf[Map[Double,Int]]
}
//def evaluate(buffer: Row) = (Map(1.0->10,2.0->20), Map(10.0->100,11.0->110))
// Returns both buffer slots as a tuple, each cast to the mutable map type.
def evaluate(buffer: Row) = (buffer(0).asInstanceOf[Map[Double,Int]], buffer(1).asInstanceOf[Map[Double,Int]])
}
这段代码可以正常编译,但在运行时会报错:
// Two-row frame: Double column "k" and Int column "v".
val df = Seq(1.0 -> 1, 2.0 -> 2).toDF("k", "v")
// Apply DummyUDAF over the whole frame and print at most one row, untruncated.
df.select(DummyUDAF($"k", $"v")).show(numRows = 1, truncate = false)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 70.0 failed 4 times, most recent failure: Lost task 1.3 in stage 70.0 (TID 204, 10.91.252.25): java.lang.ClassCastException: scala.collection.immutable.Map$EmptyMap$ cannot be cast to scala.collection.mutable.Map
另一个解决方案(见here)指出,问题可能出在StructType中的MapType上。但是,当我尝试其中提到的解决方案时,仍然得到相同的错误。
// DistinctValues is defined outside this snippet.
val distudaf = new DistinctValues()
// (col1, col2) string pairs, with repeated values inside each col1 group.
val df = Seq(
  "a" -> "a1", "a" -> "a1", "a" -> "a2",
  "b" -> "b1", "b" -> "b2", "b" -> "b3", "b" -> "b1", "b" -> "b1"
).toDF("col1", "col2")
// Aggregate col2 per col1 group and print the result under the column name "DV".
df.groupBy("col1").agg(distudaf($"col2").alias("DV")).show()
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 22.0 failed 4 times, most recent failure: Lost task 1.3 in stage 22.0 (TID 100, 10.91.252.25): java.lang.ClassCastException: scala.collection.immutable.Map$EmptyMap$ cannot be cast to scala.collection.mutable.Map
我更倾向于就地修改Map,因为我预计Map会非常大,而复制并重新赋值可能会导致性能/内存瓶颈。
答案 0 :(得分:4)
我对UDAF的有限理解是:你只应该通过赋值来设置想要(语义上)更新的内容,即取出MutableAggregationBuffer中已有的值,与想要添加的内容合并,然后用 = 把结果赋值回去(这在底层会调用update(i: Int, value: Any): Unit)。
您的代码可能如下所示:
/**
 * Folds one input row into both map slots of the aggregation buffer.
 *
 * Each slot is read, extended with one entry, and the resulting map is
 * assigned back to the slot instead of being mutated in place.
 *
 * @param buffer aggregation buffer holding a `Map[Double, Int]` in slots 0 and 1
 * @param input  row with a double at index 0 and an int at index 1
 */
def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
  val current0 = buffer.getAs[Map[Double, Int]](0)
  val key = input.getDouble(0)
  val value = input.getInt(1)
  // Slot 0 receives the entry as-is.
  buffer(0) = current0 + (key -> value)

  val current1 = buffer.getAs[Map[Double, Int]](1)
  // Slot 1 receives the same entry with key and value both scaled by 10.
  buffer(1) = current1 + (key * 10 -> value * 10)
}
完整的DummyUDAF可以写成如下形式:
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{Row, Column}
/**
 * UDAF that accumulates two `Map[Double, Int]` values and returns them as a struct.
 *
 * Buffer slots are never mutated in place: each callback reads the current map,
 * builds a new one, and assigns it back to the slot.
 */
object DummyUDAF extends UserDefinedAggregateFunction {

  /** Input columns: "x" (double) and "y" (integer). */
  override def inputSchema: StructType =
    StructType(Seq(
      StructField("x", DoubleType),
      StructField("y", IntegerType)
    ))

  /** Two buffer slots, each a map from double to integer. */
  override def bufferSchema: StructType =
    StructType(Seq(
      StructField("buff", MapType(DoubleType, IntegerType)),
      StructField("buff2", MapType(DoubleType, IntegerType))
    ))

  /** Result struct with map-typed fields "xs" and "ys". */
  override def dataType: StructType =
    StructType(Seq(
      StructField("xs", MapType(DoubleType, IntegerType)),
      StructField("ys", MapType(DoubleType, IntegerType))
    ))

  override def deterministic: Boolean = true

  /** Starts both slots as empty maps. */
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = Map.empty[Double, Int]
    buffer(1) = Map.empty[Double, Int]
  }

  /**
   * Adds (x -> y) to slot 0 and (x * 10 -> y * 10) to slot 1, writing the
   * extended maps back into the buffer.
   */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val current0 = buffer.getAs[Map[Double, Int]](0)
    val key = input.getDouble(0)
    val value = input.getInt(1)
    buffer(0) = current0 + (key -> value)

    val current1 = buffer.getAs[Map[Double, Int]](1)
    buffer(1) = current1 + (key * 10 -> value * 10)
  }

  /** Combines each slot of `buffer2` into the same slot of `buffer1`; on equal keys the `buffer2` entry wins. */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    for (slot <- 0 to 1) {
      val left = buffer1.getAs[Map[Double, Int]](slot)
      val right = buffer2.getAs[Map[Double, Int]](slot)
      buffer1(slot) = left ++ right
    }
  }

  // Hard-coded alternative kept from the original example:
  //def evaluate(buffer: Row) = (Map(1.0->10,2.0->20), Map(10.0->100,11.0->110))

  /** Returns the two accumulated maps as a tuple. */
  override def evaluate(buffer: Row): (Map[Double, Int], Map[Double, Int]) =
    (buffer.getAs[Map[Double, Int]](0), buffer.getAs[Map[Double, Int]](1))
}
答案 1 :(得分:0)
来晚了。我刚刚发现可以使用
// Buffer layout: a single field "map" typed as an ObjectType wrapping mutable.Map[String, Long].
override def bufferSchema: StructType =
  new StructType().add("map", ObjectType(classOf[mutable.Map[String, Long]]))
来在缓冲区中使用mutable.Map。