Difference in total rows returned by an MS SQL query and Spark SQL

Time: 2018-12-05 06:21:05

Tags: sql-server apache-spark apache-spark-sql

SQL query:

SELECT     
    a.AcctBranchName, 
    c.CustomerNum,
    c.SourceCustomerId,
    a.SourceAccountId, 
    a.AccountNum, 
    c.FullName,
    c.LastName,
    c.BirthDate, 
    a.Balance,
    case when [RollOverStatus] = 'Y' then 'Yes' else 'No' end as RollOverStatus
FROM         
     v_Account AS a left join v_Customer AS c 
    ON c.CustomerID = a.CustomerID AND c.Businessdate = a.Businessdate
WHERE     
    a.Category = 'Deposit' AND
    c.Businessdate= '2018-11-28'  AND
    isnull(a.Classification,'N/A') IN ('Contractual Account','Non-Term Deposit','Term Deposit')
      AND IsActive = 'Yes'

My Spark code:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object ListOfSavingFiltered {
  def merge(srcPath: String, dstPath: String): Unit =  {
    val hadoopConfig = new Configuration()
    val hdfs = FileSystem.get(hadoopConfig)
    FileUtil.copyMerge(hdfs, new Path(srcPath), hdfs, new Path(dstPath), false, hadoopConfig, null)
    // the "true" setting deletes the source files once they are merged into the new output
  }
  def main(args: Array[String]): Unit = {

    val url = "jdbc:sqlserver://localhost;databaseName=InsightWarehouse;integratedSecurity=true";
    val driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"

    val v_Account = "dbo.v_Account"
    val v_Customer = "dbo.v_Customer"

    val spark = SparkSession
      .builder
      .master("local[*]")
      //.config("spark.debug.maxToStringFields", "100")
      .appName("Insight Application Big Data")
      .getOrCreate()



    val dfAccount = spark
      .read
      .format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable",v_Account)
      .load()

    val dfCustomer = spark
      .read
      .format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable",v_Customer)
      .load()

    //dfAccount.printSchema()
    // join v_Account to v_Customer; every predicate sits inside the join
    // condition of the left_outer join
    val Classification = Seq("Contractual Account", "Non-Term Deposit", "Term Deposit")
    val joined = dfAccount.as("a")
      .join(dfCustomer.as("c"),
        dfAccount.col("BusinessDate").equalTo(dfCustomer.col("BusinessDate"))
          && dfCustomer.col("CustomerID").equalTo(dfAccount.col("CustomerID"))
          && dfAccount.col("BusinessDate") === "2018-11-28"
          && dfAccount.col("Category") === "Deposit"
          && dfAccount.col("IsActive").equalTo("Yes")
          && dfAccount.col("Classification").isin(Classification: _*),
        "left_outer")

    //joined.show()
    //val columns = Seq[String]()

    val outputfile = "src/main/resources/out"
    val filename = "lifOfSaving.csv.gz"
    val outputFileName = outputfile + "/temp_" + filename
    val mergedFileName = outputfile + "/merged_" + filename
    val mergeFindGlob = outputFileName
    System.out.println("=== Print out schema ===")

    val responseWithSelectedColumns = joined.select(
      "a.AcctBranchName",
      "c.CustomerNum",
      "c.SourceCustomerId",
      "a.SourceAccountId",
      "a.AccountNum",
      "c.FullName",
      "c.LastName",
      "c.BirthDate",
      "a.Balance",
      "RollOverStatus"
    ).withColumn("RollOverStatus",when(col("RollOverStatus").equalTo("Y"),"Yes").otherwise("No"))

    responseWithSelectedColumns
      // .coalesce(1) // so that just a single part-file is created
      .repartition(4)
      .write.mode("overwrite")
      .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
      .format("com.databricks.spark.csv")
      .option("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") // avoid creating .crc files
      .option("header", "true") // write the header
      .save(outputFileName)
    merge(mergeFindGlob, mergedFileName )
    responseWithSelectedColumns.unpersist()

    spark.stop()
  }
}

The SQL query returns 230,607 rows, but Spark returns 7,152,395.

What is wrong with the Spark code compared to the SQL query? Please advise.

1 Answer:

Answer 0 (score: 2)

You can check the following, which joins on the key columns and moves the remaining predicates out of the join condition into a filter applied after the join:

val joined = dfAccount.as("a")
  .join(dfCustomer.as("c"), Seq("BusinessDate", "CustomerID"), "LEFT")
  .filter(
    dfAccount.col("BusinessDate") === "2018-11-28"
      && dfAccount.col("Category") === "Deposit"
      && dfAccount.col("IsActive").equalTo("Yes")
      && dfAccount.col("Classification").isin(Classification: _*))

Also, if you do not want to load the full tables into Spark and would rather have the SQL layer do the processing, you can push the whole query down through JDBC with the following code.

val query = """(SELECT
    a.AcctBranchName,
    c.CustomerNum,
    c.SourceCustomerId,
    a.SourceAccountId,
    a.AccountNum,
    c.FullName,
    c.LastName,
    c.BirthDate,
    a.Balance,
    case when [RollOverStatus] = 'Y' then 'Yes' else 'No' end as RollOverStatus
FROM
    v_Account AS a left join v_Customer AS c
    ON c.CustomerID = a.CustomerID AND c.Businessdate = a.Businessdate
WHERE
    a.Category = 'Deposit' AND
    c.Businessdate = '2018-11-28' AND
    isnull(a.Classification, 'N/A') IN ('Contractual Account', 'Non-Term Deposit', 'Term Deposit')
    AND IsActive = 'Yes') tmp"""


val joined= spark
      .read
      .format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable",query)
      .load()
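
With this pushdown variant SQL Server executes the join and the filters itself, so a quick sanity check (hypothetical addition, not part of the original answer) is to compare the DataFrame count against the row count you see in SSMS:

// Should match the 230,607 rows returned by the original T-SQL query.
println(joined.count())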