我使用Apache Spark处理数据库或文件中的大量行。部分处理创建了一个3行的滑动窗口,其中行需要展平,并在展平的行上执行其他计算。以下是尝试完成的简化示例。
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.expressions.Window
object Main extends App {
val ss = SparkSession.builder().appName("DataSet Test")
.master("local[*]").getOrCreate()
import ss.implicits._
case class Foo(a:Int, b:String )
// rows from database or file
val foos = Seq(Foo(-18, "Z"),
Foo(-11, "G"),
Foo(-8, "A"),
Foo(-4, "C"),
Foo(-1, "F")).toDS()
// work on 3 rows
val sliding_window_spec = Window.orderBy(desc("a")).rowsBetween( -2, 0)
// flattened object with example computations
case class FooResult(a1:Int, b1:String, a2:Int, b2:String, a3:Int, b3:String, computation1:Int, computation2:String )
// how to convert foo to fooResult???
// flatten 3 rows into 1 and do additional computations on flattened rows
// expected results
val fooResults = Seq(FooResult( -1, "F", -4, "C", -8, "A", -5, "FCA" ),
FooResult( -4, "C", -8, "A", -11, "G", -12, "CAG" ),
FooResult( -8, "A", -11, "G", -18, "Z", -19, "AGZ" )).toDS()
ss.stop()
}
如何将foos转换为fooResults?我正在使用Apache Spark 2.3.0
答案 0 :(得分:2)
// how to convert foo to fooResult??? // flatten 3 rows into 1 and do additional computations on flattened rows
您可以使用您已定义的collect_list
函数 ,然后通过定义{{1}来简单地使用window
内置函数函数,你可以做计算部分和展平部分。最后,您可udf
和扩展filter
列,以获得最终所需结果
struct
应该给你
def slidingUdf = udf((list1: Seq[Int], list2:Seq[String])=> {
if(list1.size < 3) null
else {
val zipped = list1.zip(list2)
FooResult(zipped(0)._1, zipped(0)._2, zipped(1)._1, zipped(1)._2, zipped(2)._1, zipped(2)._2, zipped(0)._1+zipped(1)._1, zipped(0)._2+zipped(1)._2+zipped(2)._2)
}
})
foos.select(slidingUdf(collect_list("a").over(sliding_window_spec), collect_list("b").over(sliding_window_spec)).as("test"))
.filter(col("test").isNotNull)
.select(col("test.*"))
.show(false)
注意:请记住,案例类应在当前会话范围之外定义