使用spark sql优化器优化两个范围的连接

时间:2018-10-23 23:55:51

标签: apache-spark

我正在使用 Spark SQL 优化器来优化两个 Range 的连接:通过自定义规则,把连接改写为直接计算两个范围的交集,从而避免连接操作

// Test that registers a custom optimizer rule intended to replace a join of two
// ranges with a single range covering their overlap, then runs a query against it.
test("SparkTest") {
    // Custom rule: rewrites a matching Join node into a single Range node.
    object RangeIntersectRule extends Rule[LogicalPlan] {
      override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
        // Matches a Join whose two children are each a Project sitting directly on a Range.
        // Only the start/end bounds are bound; the project lists, the remaining Range
        // fields and the last two Join fields are all ignored via `_`.
        case Join(Project(_, Range(start1, end1, _, _, _, _)), Project(_, Range(start2, end2, _, _, _, _)), _, _) => {
          // Overlap bounds: the larger of the two starts and the smaller of the two ends.
          val start = start1 max start2
          val end = end1 min end2
          // Non-overlapping inputs yield Range(0, 0, ...); otherwise the overlap range.
          // NOTE(review): the replacement Range does not reuse the output attributes of the
          // matched ranges (they are discarded by `_` above) and the matched Project nodes
          // are dropped. Presumably the new Range gets fresh attribute ids, which would be
          // consistent with the reported "Couldn't find id#0L in [id#14L]" error -- confirm.
          if (start1 > end2 || end1 < start2) Range(0, 0, 1, Some(1), false) else Range(start, end, 1, Some(1), false)


        }
      }
    }

    val spark = SparkSession.builder().master("local").appName("SparkTest").enableHiveSupport().getOrCreate()
    // Register the rule as an extra optimizer rule for this session.
    spark.experimental.extraOptimizations = Seq(RangeIntersectRule)
    // t1 exposes range(10, 40) as column x; t2 exposes range(20, 50) as column y.
    spark.range(10, 40).toDF("x").createOrReplaceTempView("t1")
    spark.range(20, 50).toDF("y").createOrReplaceTempView("t2")
    // Equi-join on x = y, selecting only the t1 side.
    val df = spark.sql("select t1.x from t1 join t2 on t1.x = t2.y")
    df.explain(true)
    df.show(truncate = false)
  }

但是当我运行它时会抛出异常,有人可以帮忙指出问题所在吗?谢谢

优化的逻辑计划和物理计划是:

== Optimized Logical Plan ==
Project [x#2L]
+- !Project [id#0L AS x#2L]
   +- Range (20, 40, step=1, splits=Some(1))

== Physical Plan ==
Project [x#2L]
+- !Project [id#0L AS x#2L]
   +- Range (20, 40, step=1, splits=1)

异常是:

Caused by: java.lang.RuntimeException: Couldn't find id#0L in [id#14L]
    at scala.sys.package$.error(package.scala:27)
    at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:106)
    at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:100)
    at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:59)
    ... 47 more

1 个答案:

答案 0 :(得分:0)

/**
 * Optimizer rule that replaces an inner join of two Range nodes with a Project over a
 * single Range spanning the overlap of their bounds.
 *
 * Only ranges with step 1, `Some(1)` splits and a `false` final flag are matched. The
 * left range's output attributes are reused for both the new Range and the Project.
 */
object RangeIntersectRule extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    // NOTE(review): the join condition is not inspected (`_`), so every inner join of
    // two matching ranges is rewritten regardless of its condition -- confirm intended.
    case Join(
          Range(leftStart, leftEnd, 1, Some(1), leftOutput, false),
          Range(rightStart, rightEnd, 1, Some(1), _, false),
          Inner,
          _) =>
      val disjoint = leftStart > rightEnd || leftEnd < rightStart
      // Disjoint inputs collapse to the empty range [0, 0); otherwise take the overlap.
      val (lower, upper) =
        if (disjoint) (0L, 0L)
        else (leftStart max rightStart, leftEnd min rightEnd)
      Project(leftOutput, Range(lower, upper, 1, Some(1), leftOutput, false))
  }
}