In the snippet below, the second aggregation fails (unsurprisingly):
java.lang.ClassCastException: org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema cannot be cast to spark_test.Record
package spark_test

import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{DataFrame, Encoder, Encoders, SparkSession}
import org.scalatest.FunSuite

case class Record(k1: String, k2: String, v: Long) extends Serializable

class MyAggregator extends Aggregator[Record, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: Record): Long = a.v + b
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction
  override def bufferEncoder: Encoder[Long] = Encoders.scalaLong
  override def outputEncoder: Encoder[Long] = Encoders.scalaLong
}

class TypeSafeAggTest extends FunSuite {

  lazy val spark: SparkSession = {
    SparkSession
      .builder()
      .master("local")
      .appName("spark test")
      .getOrCreate()
  }

  test("agg flow") {
    import spark.sqlContext.implicits._

    val df: DataFrame = Seq(
      ("a", "b", 1),
      ("a", "b", 1),
      ("c", "d", 1)
    ).toDF("k1", "k2", "v")

    val aggregator = new MyAggregator().toColumn.name("output")

    df.as[Record]
      .groupByKey(_.k1)
      .agg(aggregator)
      .show(truncate = false) // <-- works

    df.as[Record]
      .groupBy($"k1", $"k2")
      .agg(aggregator)
      .show(truncate = false) // <-- fails at runtime with the ClassCastException above
  }
}
The official documentation has a very simple example page, but it does not cover using a type-safe aggregator together with a typed grouping (so it is unclear whether that case is even supported).
Is it possible to group by multiple keys when using Spark's type-safe aggregators?
Answer 0 (score: 1):
Use the following construct:
.groupByKey(v => (v.k1, v.k2))
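
For completeness, here is a minimal sketch of the fix applied to the failing case. It assumes the Record case class, MyAggregator, and the lazy SparkSession from the question are in scope; the grouping key becomes the tuple (k1, k2):

    // Minimal sketch, assuming Record, MyAggregator, and `spark` from the
    // question above are in scope.
    import spark.implicits._

    val ds = Seq(
      Record("a", "b", 1L),
      Record("a", "b", 1L),
      Record("c", "d", 1L)
    ).toDS()

    ds.groupByKey(v => (v.k1, v.k2))                   // typed composite key
      .agg(new MyAggregator().toColumn.name("output")) // same TypedColumn as above
      .show(truncate = false)                          // (a,b) -> 2, (c,d) -> 1

This works because groupByKey keeps the data as a typed Dataset[Record], so the aggregator's input encoder receives actual Record instances. Calling the untyped groupBy($"k1", $"k2") instead drops back to a RelationalGroupedDataset, which hands the aggregator generic rows rather than Record objects, consistent with the GenericRowWithSchema cast error shown above.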