Spark Structured Streaming foreach and trigger time: data processed twice

Time: 2018-08-01 08:06:50

Tags: apache-spark apache-kafka apache-spark-sql spark-streaming

Hi everyone, I have a question. How do I set up writeStream correctly for a foreach sink so that data coming from Kafka is processed only once? Right now it processes the Kafka data twice: once when the data arrives from Kafka, and a second time when the trigger fires. How can I fix this so the data is processed only once, on the trigger? Thanks for your help.

def countResponseCodes(df: DataFrame): StreamingQuery = {
  df.selectExpr("CAST(value AS STRING)")
    .as[String]
    .map(x => new Log().parseLog(x))
    .withWatermark("timestamp", "5 minutes")
    .groupBy(col("responseCode"), window(col("timestamp"), "5 minutes"))
    .agg(count(col("responseCode")).as("count"))
    .select("responseCode", "count")
    .writeStream
    .trigger(Trigger.ProcessingTime("5 minutes"))
    .foreach(S3.forEachWriterForResponseCodes())
    .outputMode("update")
    .start()
}
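For reference, the `df` passed in is presumably built from a Kafka source (the tags and the `CAST(value AS STRING)` projection suggest as much). A minimal sketch of how such a source might be set up, where the broker address, topic name, and starting offsets are placeholder assumptions rather than values from the question:

// Sketch of the assumed Kafka source; "broker:9092" and "logs" are placeholders.
val df = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "broker:9092")
  .option("subscribe", "logs")
  .option("startingOffsets", "latest")
  .load() // yields key, value, topic, partition, offset, timestamp columns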


// The writer presumably returned by S3.forEachWriterForResponseCodes()
new ForeachWriter[Row] {
  private val sb = new StringBuilder()
  private val metadata = new ObjectMetadata()

  override def open(partitionId: Long, version: Long): Boolean = true

  override def process(value: Row): Unit =
    sb.append(s"""{"responseCode":${value(0)},"responseCount":${value(1)}}""")
  override def close(errorOrNull: Throwable): Unit = {
    val bytes = sb.toString().getBytes(StandardCharsets.UTF_8)
    val stream = new ByteArrayInputStream(bytes)
    metadata.setContentType("application/json") // was setCacheControl, which expects a Cache-Control header value
    metadata.setContentLength(bytes.length)     // length in bytes, not characters
    if (sb.nonEmpty)
      s3client.putObject(s"$s3/files", "countResponseCodes.json", stream, metadata)
    sb.setLength(0)
  }
}
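One detail worth noting: `open` always returns true here, so every micro-batch (including a replayed one after a failure or restart) is written out. Spark passes a (partitionId, version) pair to `open` precisely so a writer can decline data it has already handled, by returning false. A hedged sketch of that idea follows; `processedVersions` is a hypothetical in-memory helper introduced for illustration, and a real implementation would need durable, restart-safe tracking (e.g. in S3 or a database):

// Sketch only: remember the last version written per partition and skip replays.
// `processedVersions` is hypothetical; in-memory state does not survive executor restarts.
private val processedVersions = scala.collection.mutable.Map.empty[Long, Long]

override def open(partitionId: Long, version: Long): Boolean = {
  val alreadyDone = processedVersions.get(partitionId).exists(_ >= version)
  if (!alreadyDone) processedVersions(partitionId) = version
  !alreadyDone // returning false tells Spark not to call process() for this batch
}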

0 Answers:

No answers yet