我正在使用spark2.3.1和scala2.12,请参阅此页面: https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/sql/UserDefinedTypedAggregation.scala
首先,这是 employees.json 文件:
{"name":"Michael", "salary":3000}
{"name":"Andy", "salary":4500}
{"name":"Justin", "salary":3500}
{"name":"Berta", "salary":4000}
然后是 Scala 代码(来自 https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/sql/UserDefinedTypedAggregation.scala ),我在这里对代码做了精简:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator
/**
 * Stand-alone Spark SQL job that reads `employees.json` as a typed `Dataset[Employee]`,
 * prints it, and then prints the average salary computed by the type-safe user-defined
 * aggregator [[sqlAggregation.MyAverage]].
 *
 * The entry point is an explicit `main` method instead of `extends App`: Spark's own
 * documentation warns that subclasses of `scala.App` may not work correctly, because the
 * object's fields are initialised lazily through `delayedInit`.
 *
 * NOTE(review): a `NoSuchMethodError` on `Dataset.select(TypedColumn, Encoder)` at runtime
 * usually indicates that the job was compiled against a different Scala binary version than
 * the one the Spark distribution ships with (pre-built Spark 2.3.x targets Scala 2.11) —
 * confirm the `scalaVersion` / Spark artifact suffix in the build definition.
 */
object sqlAggregation {

  /** One record of the input file: an employee name and a salary. */
  case class Employee(name: String, salary: Long)

  /** Mutable aggregation buffer holding the running salary total and the row count. */
  case class Average(var sum: Long, var count: Long)

  /** Typed aggregator that folds `Employee` rows into an `Average` and yields the mean salary. */
  object MyAverage extends Aggregator[Employee, Average, Double] {

    /** A zero value for this aggregation. Should satisfy the property that any b + zero = b. */
    def zero: Average = Average(0L, 0L)

    /** Folds one employee into the buffer; the buffer is updated in place and returned. */
    def reduce(buffer: Average, employee: Employee): Average = {
      buffer.sum += employee.salary
      buffer.count += 1
      buffer
    }

    /** Combines two intermediate buffers; `b1` is updated in place and returned. */
    def merge(b1: Average, b2: Average): Average = {
      b1.sum += b2.sum
      b1.count += b2.count
      b1
    }

    /** Turns the final buffer into the result: total salary divided by row count. */
    def finish(reduction: Average): Double = reduction.sum.toDouble / reduction.count

    /** Encoder for the intermediate `Average` buffer. */
    def bufferEncoder: Encoder[Average] = Encoders.product

    /** Encoder for the final `Double` result. */
    def outputEncoder: Encoder[Double] = Encoders.scalaDouble
  }

  /**
   * Builds the session, runs the aggregation and always stops the session afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("spark sql")
      .set("spark.sql.warehouse.dir", System.getProperty("user.dir"))
      .setMaster("local[4]")

    val spark = SparkSession
      .builder()
      .config(conf)
      .getOrCreate()

    try {
      // Brings the implicit encoders needed by `.as[Employee]` and `select` into scope.
      import spark.implicits._

      val ds = spark.read.json("employees.json").as[Employee]
      ds.show()

      // Convert the function to a `TypedColumn` and give it a name
      val averageSalary = MyAverage.toColumn.name("average_salary")
      val result = ds.select(averageSalary)
      result.show()
    } finally {
      // Release the session even when reading or aggregating fails.
      spark.stop()
    }
  }
}
但是在通过 spark-submit 提交时,它抛出以下异常:
18/10/24 19:10:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+-------+------+
| name|salary|
+-------+------+
|Michael| 3000|
| Andy| 4500|
| Justin| 3500|
| Berta| 4000|
+-------+------+
Exception in thread "main" java.lang.NoSuchMethodError: org.apache.spark.sql.Dataset.select(Lorg/apache/spark/sql/TypedColumn;Lorg/apache/spark/sql/Encoder;)Lorg/apache/spark/sql/Dataset;
at sqlAggregation$.delayedEndpoint$sqlAggregation$1(sqlAggregation.scala:51)
at sqlAggregation$delayedInit$body.apply(sqlAggregation.scala:5)
at scala.Function0$class.apply$mcV$sp(Function0.scala:34)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.App$$anonfun$main$1.apply(App.scala:76)
at scala.collection.immutable.List.foreach(List.scala:381)
... ...
我运行的就是官方的 Spark 示例,但不知道真正的问题出在哪里。应该如何解决?