如何将逻辑计划(LogicalPlan)转换为 DataFrame / Dataset(Apache Spark)

时间:2016-08-11 16:28:49

标签: apache-spark apache-spark-dataset

我正在使用spark 2.0.0。这是我的代码:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

/**
 * Minimal Spark SQL example: loads a CSV file, exposes it as a temporary view,
 * and prints the logical plan of a query over that view before running the
 * `EliminateSubqueryAliases` rule on it.
 */
object WikiDataframe {

  /**
   * Reads the CSV file at `FILE_LOCATION` (first row as header, schema inferred)
   * and registers it as the temporary view `pageviews_by_second`.
   *
   * NOTE(review): `FILE_LOCATION` is not defined in this snippet; it must be
   * provided elsewhere in the enclosing file.
   *
   * @param sparkSession session used to read the file
   * @return the loaded DataFrame
   */
  def getDataframe(sparkSession: SparkSession): DataFrame = {
    val df = sparkSession.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv(FILE_LOCATION)

    // registerTempTable is deprecated since Spark 2.0.0; createOrReplaceTempView is
    // its documented replacement.
    df.createOrReplaceTempView("pageviews_by_second")

    df
  }

  /**
   * Entry point: builds a local session, registers the view, then prints the
   * logical plan of a nested query and applies `EliminateSubqueryAliases` to it.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession
      .builder()
      .appName("Spark SQL Example")
      .master("local")
      .getOrCreate()

    // Called for its side effect: the SQL below refers to the view it registers.
    WikiDataframe.getDataframe(sparkSession)

    val query: DataFrame =
      sparkSession.sql("select Date from (select * from pageviews_by_second ) a")

    val logicalQuery: LogicalPlan = query.queryExecution.logical

    println("logicalQuery : " + logicalQuery)

    println("Eliminating sub queries")

    // The rewritten plan is computed but not yet consumed, exactly as in the original.
    val aliasFreeQuery: LogicalPlan = EliminateSubqueryAliases(logicalQuery)

  }
}

我卡在了如何执行 logicalQuery 这一步。如果可能,我想由它得到 DataFrame 或 Dataset。任何帮助将不胜感激。

1 个答案:

答案 0 :(得分:0)

你需要在 org.apache.spark.sql 包下写一个类,并在其中定义类似下面的方法:

/**
 * Builds a DataFrame from a logical plan by delegating to `Dataset.ofRows`.
 *
 * NOTE(review): per the accompanying answer this method has to live in a class
 * under the `org.apache.spark.sql` package, presumably because `Dataset.ofRows`
 * is package-private. Confirm against the Spark version in use.
 *
 * @param Sqlctx session that will own the resulting DataFrame
 * @param Plan   logical plan to wrap
 * @return the DataFrame produced by `Dataset.ofRows` for `Plan`
 */
def apply(Sqlctx: SparkSession, Plan: LogicalPlan): DataFrame =
  Dataset.ofRows(Sqlctx, Plan)