I have a DataFrame with three columns. I need to group it by city and run a map over only the most recent k records of each group; for each city, a value has to be computed from its last two records. Here is the code I wrote, and it throws a NotSerializableException. The DataFrame looks like this:
<pre><code>
+------------+-----+-----+
| cities|temps|preas|
+------------+-----+-----+
| Hyderabad| 30| 3|
|Secunderabad| 38| 3|
| Mumbai| 41| 4|
| Chennai| 39| 3|
| Mumbai| 25| 2|
| Chennai| 21| 2|
| Hyderabad| 36| 3|
|secunderabad| 32| 3|
| Chennai| 26| 2|
| Mumbai| 45| 4|
| Chennai| 20| 2|
| Hyderabad| 39| 3|
|Secunderabad| 42| 4|
+------------+-----+-----+
</code></pre>
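For reference, the sample data can be rebuilt roughly like this (the SparkSession name `spark` is my assumption):

<pre><code>// Hypothetical reconstruction of the sample DataFrame shown above
// (assumes an active SparkSession named `spark`).
import spark.implicits._

val df = Seq(
  ("Hyderabad", 30, 3), ("Secunderabad", 38, 3), ("Mumbai", 41, 4),
  ("Chennai", 39, 3), ("Mumbai", 25, 2), ("Chennai", 21, 2),
  ("Hyderabad", 36, 3), ("secunderabad", 32, 3), ("Chennai", 26, 2),
  ("Mumbai", 45, 4), ("Chennai", 20, 2), ("Hyderabad", 39, 3),
  ("Secunderabad", 42, 4)
).toDF("cities", "temps", "preas")
</code></pre>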
<pre><code>import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number, max}
import spark.implicits._   // needed for the $"..." syntax; `spark` is assumed to be the SparkSession
val k = 2
// window partitioned by city (no explicit ordering is specified)
@transient val w = Window.partitionBy("cities")
val coll = List("cities", "temps", "preas")
val cl = coll.map(a => col(a))
// append a row_number column computed within each city partition
val colls = cl :+ row_number.over(w).as("row_number")
@transient val nums = df.select(colls:_*)
// keep only rows whose row_number exceeds (max row_number - k), i.e. the last k rows per city
val limits = nums.groupBy("cities").agg(max($"row_number").minus(k).as("limit"))
val data = nums.join(limits, "cities").filter($"row_number".gt($"limit"))
// re-base row_number to 1..k and drop the helper columns
val dfd = data.withColumn("row_number", $"row_number".minus($"limit")).select(cl:_*)
val dfRDD = dfd.rdd
import org.apache.spark.sql.Row
// weights applied to the k = 2 rows kept for each city
val weights = List(1, 2)
// for one city's rows: compute the weighted average of every numeric column
val func = (it: Iterable[Row]) => {
  val lis = it.toList
  var res: List[Int] = List()
  val len = lis(0).size
  for (i <- 1 until len) {   // skip column 0, the city name
    res = res :+ lis.map(a => a(i)).map(b => b.asInstanceOf[Int])
      .zip(weights).map(c => c._1 * c._2).sum / (weights.reduce(_ + _))
  }
  res
}
// group rows by city and apply the weighted-average function to each group
val outp = dfRDD.groupBy(row => row(0)).map { case (k, v) => (k, func(v)) }
</code></pre>
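To make the intent of `func` more concrete, here is a minimal plain-Scala check of the per-column weighted average it is supposed to produce for one group; the two input values are illustrative only, not actual output of the job:

<pre><code>// Illustration: a hypothetical pair of "last two" temps for one city
val weights = List(1, 2)
val lastTwoTemps = List(25, 45)
val weighted = lastTwoTemps.zip(weights).map { case (v, w) => v * w }.sum / weights.sum
// weighted == (25*1 + 45*2) / (1 + 2) == 38   (integer division)
</code></pre>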