Hi everyone, I have a question. How do I correctly set up writeStream with a foreach sink so that data from Kafka is processed exactly once? Right now the data is processed twice: once when the data arrives in Kafka, and a second time when the trigger fires. How can I fix this so the data is only processed once, on the trigger? Thanks for your help.
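For context, a minimal sketch of how the streaming DataFrame passed to countResponseCodes below would typically be obtained from Kafka; the topic name and bootstrap servers here are placeholders, not taken from the actual setup.

import org.apache.spark.sql.{DataFrame, SparkSession}

val spark = SparkSession.builder().appName("log-analytics").getOrCreate()

// Read the Kafka topic as a streaming DataFrame (topic and servers are placeholders).
val kafkaDf: DataFrame = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "logs")
  .option("startingOffsets", "latest")
  .load()

// This DataFrame is what gets passed to countResponseCodes below.
countResponseCodes(kafkaDf)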
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.{col, count, window}
import org.apache.spark.sql.streaming.{StreamingQuery, Trigger}

def countResponseCodes(df: DataFrame): StreamingQuery = {
  df.selectExpr("CAST(value AS STRING)")
    .as[String]                                // needs spark.implicits._ in scope
    .map(x => new Log().parseLog(x))
    .withWatermark("timestamp", "5 minutes")
    .groupBy(col("responseCode"), window(col("timestamp"), "5 minutes"))
    .agg(count(col("responseCode")).as("count"))
    .select("responseCode", "count")
    .writeStream
    .trigger(Trigger.ProcessingTime("5 minutes"))
    .foreach(S3.forEachWriterForResponseCodes())
    .outputMode("update")
    .start()
}
And this is the ForeachWriter returned by S3.forEachWriterForResponseCodes():

import java.io.ByteArrayInputStream
import java.nio.charset.StandardCharsets
import com.amazonaws.services.s3.model.ObjectMetadata
import org.apache.spark.sql.{ForeachWriter, Row}

new ForeachWriter[Row] {
  private val sb = new StringBuilder()
  private val metadata = new ObjectMetadata()

  override def open(partitionId: Long, version: Long): Boolean = true

  // Buffer each aggregated row as a JSON object.
  override def process(value: Row): Unit =
    sb.append(s"""{"responseCode":${value(0)},"responseCount":${value(1)}}""")

  // Upload the buffered JSON to S3 when the writer is closed for this partition/epoch.
  override def close(errorOrNull: Throwable): Unit = {
    if (sb.nonEmpty) {
      val bytes = sb.toString.getBytes(StandardCharsets.UTF_8)
      val stream = new ByteArrayInputStream(bytes)
      metadata.setContentType("application/json")  // was setCacheControl, which expects a Cache-Control value
      metadata.setContentLength(bytes.length)      // byte length, not character count
      s3client.putObject(s"$s3/files", "countResponseCodes.json", stream, metadata)
    }
    sb.setLength(0)
  }
}