Spark 2.3.0 with Scala 2.11. I'm trying to write a custom Aggregator and run it over a window function, per these docs, but I'm getting the error in the title. Here is a stripped-down example, written as a FunSuite test.
I know the error message says to file a bug report, but this example is so simple, lifted almost directly from the documentation, that I wonder whether something in my own code is causing it. In particular, I wonder whether using a collection type as the buffer is unsupported or unusual in some way. I haven't found any other questions about this error on Stack Overflow.
package com.foobar

import com.holdenkarau.spark.testing._
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.{Aggregator, Window}
import org.scalatest.FunSuite
import org.slf4j.LoggerFactory

import scala.collection.mutable.ListBuffer

case class Foo(ID: Long, RANK: Int)

class Example extends FunSuite with DataFrameSuiteBase {

  test("example") {
    val data = Seq(
      Foo(5555, 1),
      Foo(5555, 2),
      Foo(8888, 1),
      Foo(8888, 2)
    )

    import spark.implicits._
    val df = sc.parallelize(data).toDF

    val w = Window.partitionBy("ID").orderBy("RANK")

    // The three types are: input, buffer, and output.
    object AggregateFoos extends Aggregator[Foo, ListBuffer[Foo], Boolean] {

      // A zero value for this aggregation. Should satisfy the property that any b + zero = b
      override def zero: ListBuffer[Foo] = new ListBuffer[Foo]()

      // Add a single input element to the buffer
      override def reduce(b: ListBuffer[Foo], a: Foo): ListBuffer[Foo] = {
        b += a
      }

      // Merge two intermediate values
      override def merge(b1: ListBuffer[Foo], b2: ListBuffer[Foo]): ListBuffer[Foo] = {
        (b1 ++ b2).sortBy(b => b.RANK)
      }

      // Transform the output of the reduction
      override def finish(reduction: ListBuffer[Foo]): Boolean = {
        true // in real life there would be logic here
      }

      // Specifies the Encoder for the intermediate value type
      override def bufferEncoder: Encoder[ListBuffer[Foo]] = {
        ExpressionEncoder()
      }

      // Specifies the Encoder for the final output value type
      override def outputEncoder: Encoder[Boolean] = {
        ExpressionEncoder()
      }
    }

    val agg = AggregateFoos.toColumn.name("agg")
    df.select(df.col("*"), agg.over(w).as("agg")).show(false)
  }
}
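As a point of comparison, the ListBuffer buffer on its own seems exercisable without a window. Here is a minimal sketch of the same aggregator run as an ordinary typed aggregation, reusing data and AggregateFoos from the test above (I include it only to suggest that the window, not the collection buffer, is the variable; I have not exhaustively verified this path):

// Sketch: same aggregator, applied per key with groupByKey instead of a window,
// to isolate whether the ListBuffer buffer type alone triggers the error.
import spark.implicits._
val ds = sc.parallelize(data).toDS()
ds.groupByKey(_.ID)
  .agg(AggregateFoos.toColumn.name("agg"))
  .show(false)

The windowed version, however, fails.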
Here is the error message:
org.apache.spark.sql.AnalysisException: cannot resolve '(PARTITION BY `ID` ORDER BY `RANK` ASC NULLS FIRST unspecifiedframe$())' due to data type mismatch: Cannot use an UnspecifiedFrame. This should have been converted during analysis. Please file a bug report.;;
And here is the full exception:
org.apache.spark.sql.AnalysisException: cannot resolve '(PARTITION BY `ID` ORDER BY `RANK` ASC NULLS FIRST unspecifiedframe$())' due to data type mismatch: Cannot use an UnspecifiedFrame. This should have been converted during analysis. Please file a bug report.;;
'Aggregate [ID#2L, RANK#3, aggregatefoos(AggregateFoos$@592e7718, None, None, None, mapobjects(MapObjects_loopValue0, MapObjects_loopIsNull0, ObjectType(class Foo), if (isnull(lambdavariable(MapObjects_loopValue0, MapObjects_loopIsNull0, ObjectType(class Foo), true))) null else named_struct(ID, assertnotnull(lambdavariable(MapObjects_loopValue0, MapObjects_loopIsNull0, ObjectType(class Foo), true)).ID, RANK, assertnotnull(lambdavariable(MapObjects_loopValue0, MapObjects_loopIsNull0, ObjectType(class Foo), true)).RANK), input[0, scala.collection.mutable.ListBuffer, true], None) AS value#9, mapobjects(MapObjects_loopValue1, MapObjects_loopIsNull1, StructField(ID,LongType,false), StructField(RANK,IntegerType,false), if (isnull(lambdavariable(MapObjects_loopValue1, MapObjects_loopIsNull1, StructField(ID,LongType,false), StructField(RANK,IntegerType,false), true))) null else newInstance(class Foo), input[0, array<struct<ID:bigint,RANK:int>>, true], Some(class scala.collection.mutable.ListBuffer)), input[0, boolean, false] AS value#8, BooleanType, false, 0, 0) windowspecdefinition(ID#2L, RANK#3 ASC NULLS FIRST, unspecifiedframe$()) AS agg#14]
+- AnalysisBarrier
+- LocalRelation [ID#2L, RANK#3]
at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:93)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:85)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:289)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:289)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:288)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:95)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:95)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:107)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:107)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:106)
at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:118)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1$1.apply(QueryPlan.scala:122)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:122)
at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$2.apply(QueryPlan.scala:127)
at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
at org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:127)
at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:95)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:85)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:80)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:80)
at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:92)
at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:105)
at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)
at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:74)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:3296)
at org.apache.spark.sql.Dataset.select(Dataset.scala:1307)
... 49 elided
As far as I can tell, this is some kind of internal Spark error, and I'm at a loss. Any help is appreciated.
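One idea I've been wondering about, since the analyzer complains specifically about an UnspecifiedFrame: declaring the window frame explicitly instead of leaving it for the analyzer to fill in. A sketch of what I mean (wExplicit is just my name for it; I don't know whether this actually sidesteps the error):

// Sketch: spell out the row frame rather than leaving it unspecified.
// Window.unboundedPreceding and Window.currentRow are frame-boundary
// constants available in Spark 2.3.
val wExplicit = Window
  .partitionBy("ID")
  .orderBy("RANK")
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

df.select(df.col("*"), agg.over(wExplicit).as("agg")).show(false)

As I understand it, unbounded preceding to current row is the running frame that ordered windows normally default to anyway (modulo ROWS versus RANGE), so this should not change the intended semantics.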