Input Spark DataFrame df (OLTP):
+----+---------+------+
|name|date     |amount|
+----+---------+------+
|abc |4/6/2018 |100   |
|abc |4/6/2018 |200   |
|abc |4/13/2018|300   |
+----+---------+------+
Expected DF (OLAP):
+----+---------+------+
|name|date     |amount|
+----+---------+------+
|abc |4/6/2018 |100   |
|abc |4/6/2018 |200   |
|abc |4/13/2018|100   |
|abc |4/13/2018|200   |
|abc |4/13/2018|300   |
+----+---------+------+
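In other words, every row should also reappear under each later date, so each date carries a cumulative snapshot of all rows up to that point. A minimal sketch of that semantics as a non-equi join (the M/d/yyyy format and the snap_d/snap_date names are assumptions taken from the sample data, not part of my code):

import org.apache.spark.sql.functions.{col, to_date}

// Parse the M/d/yyyy strings; comparing the raw strings would sort
// "4/13/2018" before "4/6/2018".
val parsed = df.withColumn("d", to_date(col("date"), "M/d/yyyy"))

// Treat every distinct date as a snapshot and join each row onto
// every snapshot on or after its own date.
val snapshots = parsed.select(col("d").as("snap_d"), col("date").as("snap_date")).distinct()
val expected = parsed
  .join(snapshots, col("snap_d") >= col("d"))
  .select(col("name"), col("snap_date").as("date"), col("amount"))
expected.show(false)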
My code:
val df = df1.union(df1)
+----+---------+------+
|name|date     |amount|
+----+---------+------+
|abc |4/6/2018 |100   |
|abc |4/6/2018 |200   |
|abc |4/13/2018|300   |
|abc |4/6/2018 |100   |
|abc |4/6/2018 |200   |
|abc |4/13/2018|300   |
+----+---------+------+
val w1 = org.apache.spark.sql.expressions.Window.orderBy("date")
val ExpectedDF = df
  .withColumn("previousAmount", lag("amount", 1).over(w1))
  .withColumn("newdate", lag("date", 1).over(w1))
ExpectedDF.show(false)
+----+---------+------+--------------+---------+
|name|date     |amount|previousAmount|newdate  |
+----+---------+------+--------------+---------+
|abc |4/13/2018|300   |null          |null     |
|abc |4/13/2018|300   |300           |4/13/2018|
|abc |4/6/2018 |100   |300           |4/13/2018|
|abc |4/6/2018 |200   |100           |4/6/2018 |
|abc |4/6/2018 |100   |200           |4/6/2018 |
|abc |4/6/2018 |200   |100           |4/6/2018 |
+----+---------+------+--------------+---------+
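Note that lag over this window cannot produce the expected output: it only shifts values one row back, so it never copies a whole earlier date group onto a later date. Also, since date is a string, orderBy("date") sorts "4/13/2018" before "4/6/2018". A sketch of ordering the window on the parsed date instead (the M/d/yyyy format is again an assumption from the sample data):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, to_date}

// Order on the parsed date rather than the raw string; this fixes the
// row order, but lag() still only reaches one row back.
val w2 = Window.orderBy(to_date(col("date"), "M/d/yyyy"))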
Answer 0 (score: -1):
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.lag

def main(args: Array[String]): Unit = {
  val ss = SparkSession.builder().master("local").appName("Excel-read-write").getOrCreate()

  // Read the OLTP records; inferSchema keeps amount numeric.
  var df1 = ss.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("oldRecords.csv")
  df1.show(false)
  println("---- df1 row count ----" + df1.count())

  if (df1.count() > 0) {
    for (i <- 0 until df1.count().toInt - 1) {
      // Duplicate every row, then shift date/amount one position back.
      val df2 = df1.union(df1)
      val w1 = Window.orderBy("date")
      val df3 = df2
        .withColumn("previousAmount", lag("amount", 1).over(w1))
        .withColumn("newdate", lag("date", 1).over(w1))
      // lag() yields null on the first row of the window; drop it,
      // then de-duplicate the shifted rows.
      val df4 = df3.filter(df3.col("newdate").isNotNull)
      val df5 = df4.select("name", "amount", "newdate").distinct()
      df5.show(false)
      // Feed the shifted rows back in as the next iteration's input.
      df1 = df5.withColumnRenamed("newdate", "date")
    }
  }
}
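For reference, a minimal oldRecords.csv that reproduces the question's input (the file name comes from the read in the answer; the contents are copied from the sample table):

name,date,amount
abc,4/6/2018,100
abc,4/6/2018,200
abc,4/13/2018,300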