I am experimenting with a Spark Structured Streaming job in Scala. The program reads json files from an Amazon S3 bucket that contain sample employee data (some test data). The goal is to get the average age of employees by sex (male/female). I can see the averages the first time the query runs, but whenever I add a new json file to the bucket I do not get updated averages, i.e. the updated average age across all employees.

To write the results out, I created a class EmployeeDataProcesser that extends ForeachWriter[org.apache.spark.sql.Row] and overrides open(partitionId: Long, version: Long), process(record: org.apache.spark.sql.Row) and close(errorOrNull: Throwable).

From what I can observe, process() is only called the first time; it is not called again when I add new json files to the s3 bucket. The open and close methods, however, are always called.
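In case it helps, each json file in the bucket contains records shaped roughly like the one below (the field names match the schema defined in the job further down; the values themselves are just made-up test data):

{
  "name": { "firstName": "John", "middleName": "K", "lastName": "Doe" },
  "sex": "male",
  "age": 30,
  "sal": { "basic": 1000, "hra": 200, "travel": 100 }
}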
// EmployeeDataProcesser.scala
import org.apache.spark.sql.ForeachWriter

class EmployeeDataProcesser() extends ForeachWriter[org.apache.spark.sql.Row] {

  def open(partitionId: Long, version: Long): Boolean = {
    // open connection
    true
  }

  def process(record: org.apache.spark.sql.Row): Unit = {
    println(record)
  }

  def close(errorOrNull: Throwable): Unit = {
    // close the connection
    println("Close connection")
    if (errorOrNull != null)
      println(errorOrNull.getMessage)
  }
}
The following code is in a file named SparkStructuredStreaming.scala. I have omitted the import statements here to keep the code short.
class SparkStructuredStreamFromAWSS3Bucket {

  def startJob(): Unit = {
    System.setProperty("hadoop.home.dir", "C:\\Deep")
    System.setProperty("spark.sql.warehouse.dir", "file:///C:/spark-warehouse")

    val INPUT_PATH: String = "s3://my-bucket-04052019/*"

    val spark = SparkSession
      .builder
      //.config("spark.sql.warehouse.dir", "file:///c:/tmp/spark-warehouse")
      .appName("StructuredNetworkWordCount")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.hadoop.fs.s3.awsAccessKeyId", "xxxxxxxxxxxxx")
      .config("spark.hadoop.fs.s3.awsSecretAccessKey", "xxxxxxxxxxxxx")
      .config("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
      .master("local[2]")
      .getOrCreate()

    import spark.implicits._

    val personSchema = StructType(
      List(
        StructField("name",
          StructType(
            List(
              StructField("firstName", StringType, true),
              StructField("middleName", StringType, true),
              StructField("lastName", StringType, true)
            )
          ), true),
        StructField("sex", StringType, true),
        StructField("age", IntegerType, true),
        StructField("sal",
          StructType(
            List(
              StructField("basic", IntegerType, true),
              StructField("hra", IntegerType, true),
              StructField("travel", IntegerType, true)
            )
          ), true)
      )
    )

    val personDF = spark
      .readStream
      .option("multiLine", true)
      .option("mode", "PERMISSIVE")
      .schema(personSchema)
      .json(INPUT_PATH)

    val expandData = personDF.select(
      "name.firstName",
      "name.middleName",
      "name.lastName",
      "sex",
      "age",
      "sal.basic",
      "sal.hra",
      "sal.travel")

    expandData.createOrReplaceTempView("employee")

    val avgQry: String = "select sex, avg(age) from employee group by sex"
    val ageAvg = spark.sql(avgQry)

    ageAvg.writeStream
      .option("checkpointLocation", "file:///c:/spark-check-point-dir")
      .outputMode(OutputMode.Complete())
      .foreach(new EmployeeDataProcesser())
      .queryName("average age of employee")
      .start()
      .awaitTermination()
  }
}
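For completeness, I launch the job from a plain main object along these lines (the object name below is just illustrative, not the exact one in my project):

// Launcher.scala (illustrative launcher; the real object name differs)
object Launcher {
  def main(args: Array[String]): Unit = {
    // startJob() blocks on awaitTermination(), so the streaming query keeps running
    new SparkStructuredStreamFromAWSS3Bucket().startJob()
  }
}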