java.io.NotSerializableException: org.apache.spark.sql.Column

Date: 2017-05-29 06:05:24

Tags: scala apache-spark spark-dataframe

I have a DataFrame with three columns. I need to perform a map over the most recent k records within each group produced by a groupBy. The DataFrame looks like this; I have to group the records by city and compute a value from the last two records of each city.

    +------------+-----+-----+
    |      cities|temps|preas|
    +------------+-----+-----+
    |   Hyderabad|   30|    3|
    |Secunderabad|   38|    3|
    |      Mumbai|   41|    4|
    |     Chennai|   39|    3|
    |      Mumbai|   25|    2|
    |     Chennai|   21|    2|
    |   Hyderabad|   36|    3|
    |secunderabad|   32|    3| 
    |     Chennai|   26|    2| 
    |      Mumbai|   45|    4|
    |     Chennai|   20|    2|
    |   Hyderabad|   39|    3|
    |Secunderabad|   42|    4|
    +------------+-----+-----+
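
For reference, a DataFrame like the one above could be built as follows (a minimal sketch; the SparkSession name `spark` and the Int column types are assumptions, since the question does not show how the input `df` was created):

    // sketch only: recreate the sample data shown above
    // assumes a SparkSession named `spark` (e.g. from spark-shell)
    import spark.implicits._

    val df = Seq(
      ("Hyderabad", 30, 3), ("Secunderabad", 38, 3), ("Mumbai", 41, 4),
      ("Chennai", 39, 3), ("Mumbai", 25, 2), ("Chennai", 21, 2),
      ("Hyderabad", 36, 3), ("secunderabad", 32, 3), ("Chennai", 26, 2),
      ("Mumbai", 45, 4), ("Chennai", 20, 2), ("Hyderabad", 39, 3),
      ("Secunderabad", 42, 4)
    ).toDF("cities", "temps", "preas")

This is the code I wrote, and it throws the NotSerializableException: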

    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions.{col, max, row_number}
    import org.apache.spark.sql.Row
    import spark.implicits._   // for the $"..." column syntax

    val k = 2
    // window partitioned by city (no explicit ordering is specified)
    @transient val w = Window.partitionBy("cities")
    val coll = List("cities", "temps", "preas")
    val cl = coll.map(a => col(a))
    // tag every record with a per-city row number
    val colls = cl :+ row_number().over(w).as("row_number")
    @transient val nums = df.select(colls: _*)
    // per city, compute the cut-off below which rows are dropped (keep the last k)
    val limits = nums.groupBy("cities").agg(max($"row_number").minus(k).as("limit"))
    val data = nums.join(limits, "cities").filter($"row_number".gt($"limit"))
    // renumber the surviving rows 1..k and keep only the original columns
    val dfd = data.withColumn("row_number", $"row_number".minus($"limit")).select(cl: _*)
    val dfRDD = dfd.rdd

    val weights = List(1, 2)
    // weighted average (integer division) of every non-key column of a group of rows
    val func = (it: Iterable[Row]) => {
      val lis = it.toList
      var res: List[Int] = List()
      val len = lis(0).size
      for (i <- 1 until len) {
        res = res :+ lis.map(a => a(i)).map(b => b.asInstanceOf[Int])
          .zip(weights).map(c => c._1 * c._2).sum / weights.reduce(_ + _)
      }
      res
    }
    val outp = dfRDD.groupBy(row => row(0)).map { case (k, v) => (k, func(v)) }
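
For what it's worth, `func` computes, for every non-key column of a group, the weights-weighted sum divided (integer division) by the sum of the weights. A small sketch of what it returns, assuming the two Mumbai rows arrive in this order inside their group (row order within an RDD groupBy group is not guaranteed):

    // illustration only: func applied to a hypothetical group of two rows
    val sample = List(Row("Mumbai", 25, 2), Row("Mumbai", 45, 4))
    func(sample)
    // temps: (25*1 + 45*2) / (1 + 2) = 38
    // preas: ( 2*1 +  4*2) / (1 + 2) = 3
    // result: List(38, 3)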

0 Answers:

There are no answers yet.