org.apache.spark.SparkException: Job aborted due to stage failure: Task not serializable

Date: 2018-09-06 12:35:36

Tags: scala apache-spark user-defined-functions

While running the Apache Spark example below, I get the following error:

org.apache.spark.SparkException: Job aborted due to stage failure: Task not serializable: java.io.NotSerializableException: org.apache.spark.sql.TypedColumn

The error appears when the last line is executed.

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

case class Employee(name: String, salary: Long)
case class Average(var sum: Long, var count: Long)

// Input type Employee, intermediate buffer type Average, output type Double.
object MyAverage extends Aggregator[Employee, Average, Double] {

    // The zero of this aggregation: an empty running sum and count.
    def zero: Average = Average(0L, 0L)

    // Fold one input row into the buffer. Note: `+=` already adds to the
    // current value, so the right-hand side must not repeat it.
    def reduce(buffer: Average, employee: Employee): Average = {
          buffer.sum += employee.salary
          buffer.count += 1
          buffer
    }

    // Merge two intermediate buffers from different partitions.
    def merge(b1: Average, b2: Average): Average = {
      b1.sum = b1.sum + b2.sum
      b1.count = b1.count + b2.count
      b1
    }

    // Convert sum to Double before dividing, otherwise Long/Long truncates.
    def finish(reduction: Average): Double = reduction.sum.toDouble / reduction.count

    // Encoders for the intermediate buffer type and the output type.
    def bufferEncoder: Encoder[Average] = Encoders.product
    def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

val ds = spark.read.json("FileStore/tables/employee.json").as[Employee] 
ds.show()

val averageSalary = MyAverage.toColumn.name("average_salary")
val result = ds.select(averageSalary)
result.show()
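
In notebook/REPL environments, a top-level val like averageSalary becomes a field of a generated wrapper object, and a closure that references it can pull the whole wrapper, including the non-serializable TypedColumn, into the task. A commonly suggested workaround for this class of error is sketched below, untested; the name avg is only illustrative.

// Untested sketch: use the TypedColumn inline so it is never stored in a
// notebook-level val that a task closure could capture.
ds.select(MyAverage.toColumn.name("average_salary")).show()

// Untested alternative: confine the val to a local block, making it a plain
// local variable instead of a field on the notebook's wrapper object.
locally {
  val avg = MyAverage.toColumn.name("average_salary")
  ds.select(avg).show()
}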

0 Answers:

No answers