Merging new rows with the previous rows' data in a Spark Scala DataFrame

Date: 2018-04-12 11:18:58

Tags: scala apache-spark spark-dataframe

Input Spark DataFrame df (OLTP):

+----+---------+------+
|name|date     |amount|
+----+---------+------+
|abc |4/6/2018 |  100 |
|abc |4/6/2018 |  200 |
|abc |4/13/2018|  300 |
+----+---------+------+

Expected DF (OLAP):

+----+---------+------+
|name|date     |amount|
+----+---------+------+
|abc |4/6/2018 |   100|
|abc |4/6/2018 |   200|
|abc |4/13/2018|   100|
|abc |4/13/2018|   200|
|abc |4/13/2018|   300|
+----+---------+------+

My code:

val df = df1.union(df1) 

+----+---------+------+
|name|date     |amount|
+----+---------+------+
|abc |4/6/2018 |100   |
|abc |4/6/2018 |200   |
|abc |4/13/2018|300   |
|abc |4/6/2018 |100   |
|abc |4/6/2018 |200   |
|abc |4/13/2018|300   |
+----+---------+------+


val w1 = org.apache.spark.sql.expressions.Window.orderBy("date")
val expectedDF = df
  .withColumn("previousAmount", lag("amount", 1).over(w1))
  .withColumn("newdate", lag("date", 1).over(w1))
expectedDF.show(false)

+----+---------+------+--------------+---------+
|name|date     |amount|previousAmount|newdate  |
+----+---------+------+--------------+---------+
|abc |4/13/2018|300   |null          |null     |
|abc |4/13/2018|300   |300           |4/13/2018|
|abc |4/6/2018 |100   |300           |4/13/2018|
|abc |4/6/2018 |200   |100           |4/6/2018 |
|abc |4/6/2018 |100   |200           |4/6/2018 |
|abc |4/6/2018 |200   |100           |4/6/2018 |
+----+---------+------+--------------+---------+
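Note that date is a plain string here, so Window.orderBy("date") sorts lexically ("4/13/2018" sorts before "4/6/2018"), which is why the 4/13/2018 rows appear first in the output above, and a window with no partitionBy pulls the whole DataFrame into a single partition. Below is a minimal sketch (assuming the same df and the M/d/yyyy date format) that partitions by name and orders by the parsed date, so lag() looks at the chronologically previous row:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, lag, to_date}

// Order by the parsed date within each name instead of the raw string.
val w2 = Window.partitionBy("name").orderBy(to_date(col("date"), "M/d/yyyy"))

val shifted = df
  .withColumn("previousAmount", lag("amount", 1).over(w2))
  .withColumn("newdate", lag("date", 1).over(w2))

shifted.show(false)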

1 answer:

Answer 0 (score: -1):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.lag

def main(args: Array[String]): Unit = {
  val ss = SparkSession.builder().master("local").appName("Excel-read-write").getOrCreate()

  var df1 = ss.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("oldRecords.csv")
  df1.show(false)
  println("---- df1 row count ----" + df1.count())

  if (df1.count() > 0) {
    for (i <- 0 until df1.count().toInt - 1) {
      // Double the current rows, then shift amount/date back one row with lag().
      val df2 = df1.union(df1)
      val w1 = Window.orderBy("date")
      val df3 = df2
        .withColumn("previousAmount", lag("amount", 1).over(w1))
        .withColumn("newdate", lag("date", 1).over(w1))
      // Drop the first row of the window (it has no previous value) and de-duplicate.
      val df4 = df3.filter(df3.col("newdate").isNotNull)
      val df5 = df4.select("name", "amount", "newdate").distinct()
      df5.show(false)
      // The shifted rows become the input of the next iteration.
      df1 = df5.withColumnRenamed("newdate", "date")
    }
  }
}
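
For comparison, here is a non-iterative sketch of the same expansion (an alternative to the loop above, not the answer's approach): every distinct date is joined against all rows recorded on or before it, so each new date carries all earlier amounts plus its own. It assumes a DataFrame df1 holding the original input rows shown in the question and that the date strings follow M/d/yyyy.

import org.apache.spark.sql.functions.{col, to_date}

// Parse the strings so dates compare chronologically rather than lexically.
val withDate = df1.withColumn("d", to_date(col("date"), "M/d/yyyy"))

// One row per distinct date, used as the "as of" date.
val asOfDates = withDate
  .select(col("d").as("asof_d"), col("date").as("asof_date"))
  .distinct()

// Non-equi join: keep every (name, amount) recorded on or before each as-of date.
val olap = asOfDates
  .join(withDate, withDate("d") <= asOfDates("asof_d"))
  .select(col("name"), col("asof_date").as("date"), col("amount"))

olap.orderBy(to_date(col("date"), "M/d/yyyy"), col("amount")).show(false)

On the sample input this should yield the two 4/6/2018 rows unchanged plus three 4/13/2018 rows with amounts 100, 200 and 300, matching the expected OLAP output above.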