我想通过 Spark SQL 优化器的自定义规则来优化两个 Range 的连接:把连接改写为直接计算两个范围的交集,从而避免执行连接。
// Reproduction case: installs a custom optimizer rule, then runs an equi-join over two
// range-backed temp views and prints the plans and the result.
test("SparkTest") {
// Optimizer rule that replaces a Join of two Project-over-Range children with a single
// Range covering the overlap of the two intervals.
object RangeIntersectRule extends Rule[LogicalPlan] {
override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
// Only the start/end of each Range are bound. The project lists, step, slice count,
// output attributes, join type and join condition are all wildcarded, so any join of
// this shape matches regardless of its condition.
case Join(Project(_, Range(start1, end1, _, _, _, _)), Project(_, Range(start2, end2, _, _, _, _)), _, _) => {
// Overlap bounds: the larger of the two starts and the smaller of the two ends.
val start = start1 max start2
val end = end1 min end2
// Disjoint inputs are replaced by Range(0, 0, ...); otherwise the overlap is emitted.
// NOTE(review): the replacement is a bare Range built with the 5-argument form, so
// neither the matched Projects nor the matched Ranges' output attributes are carried
// over. This looks related to the "Couldn't find id#0L in [id#14L]" failure reported
// for this test - confirm.
if (start1 > end2 || end1 < start2) Range(0, 0, 1, Some(1), false) else Range(start, end, 1, Some(1), false)
}
}
}
// Local-master session with Hive support enabled.
val spark = SparkSession.builder().master("local").appName("SparkTest").enableHiveSupport().getOrCreate()
// Register the rule as an extra optimization; this replaces any previously set extra rules.
spark.experimental.extraOptimizations = Seq(RangeIntersectRule)
// Two range-backed views: t1 exposes range(10, 40) as column x, t2 exposes range(20, 50) as column y.
spark.range(10, 40).toDF("x").createOrReplaceTempView("t1")
spark.range(20, 50).toDF("y").createOrReplaceTempView("t2")
// Equi-join on the two range columns, selecting only the left-side column.
val df = spark.sql("select t1.x from t1 join t2 on t1.x = t2.y")
// Print the plans (extended output), then the rows without truncation.
df.explain(true)
df.show(truncate = false)
}
但是运行时会抛出异常,有人能帮忙指出问题出在哪里吗?谢谢。
优化后的逻辑计划和物理计划如下:
== Optimized Logical Plan ==
Project [x#2L]
+- !Project [id#0L AS x#2L]
+- Range (20, 40, step=1, splits=Some(1))
== Physical Plan ==
Project [x#2L]
+- !Project [id#0L AS x#2L]
+- Range (20, 40, step=1, splits=1)
异常信息是:
Caused by: java.lang.RuntimeException: Couldn't find id#0L in [id#14L]
at scala.sys.package$.error(package.scala:27)
at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:106)
at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:100)
at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:59)
... 47 more
答案 0 :(得分:0)
/**
 * Optimizer rule that replaces an inner equi-join of two `Range` nodes with a single
 * `Range` over the overlap of their intervals, so no join has to be executed.
 *
 * The rewrite fires only when all of the following hold:
 *  - both join children are `Range` nodes directly (no `Project` in between),
 *  - both ranges have step 1, `Some(1)` slices and are not streaming,
 *  - the join is `Inner` and its condition is exactly an equality between the two
 *    ranges' id attributes (in either order).
 *
 * Any other join is left untouched. In particular a join without a condition, or with a
 * non-equality condition, is not an intersection and must not be rewritten.
 *
 * The replacement reuses the left range's output attribute (so parents that reference it
 * still bind) and re-exposes the right range's attribute as an alias of it under the
 * original expression id (so parents that reference the right side still bind as well).
 */
object RangeIntersectRule extends Rule[LogicalPlan] {
  // Local import so the rule does not depend on a file-level import of these names.
  import org.apache.spark.sql.catalyst.expressions.{Alias, EqualTo, Expression, NamedExpression}

  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case Join(
          Range(start1, end1, 1, Some(1), output1 @ Seq(id1), false),
          Range(start2, end2, 1, Some(1), output2 @ Seq(id2), false),
          Inner,
          Some(EqualTo(left, right))) if comparesIds(left, right, id1, id2) =>
      // Overlap bounds: the larger start and the smaller end.
      val start = start1 max start2
      val end = end1 min end2
      // A step-1 range with start >= end has no rows; normalise that case to Range(0, 0).
      val intersection =
        if (start >= end) Range(0, 0, 1, Some(1), output1, false)
        else Range(start, end, 1, Some(1), output1, false)
      // On every surviving row both join keys are equal, so the right-side column is the
      // left id re-published under the right attribute's name and expression id.
      val rightColumns = output2.map(attr => Alias(id1, attr.name)(exprId = attr.exprId))
      val projectList: Seq[NamedExpression] = output1 ++ rightColumns
      Project(projectList, intersection)
  }

  /** True when `left = right` compares exactly the two range id attributes, in either order. */
  private def comparesIds(
      left: Expression,
      right: Expression,
      id1: Expression,
      id2: Expression): Boolean =
    (left.semanticEquals(id1) && right.semanticEquals(id2)) ||
      (left.semanticEquals(id2) && right.semanticEquals(id1))
}