Spark's watermarking feature is very handy when it comes to late events. But I am not sure how to handle a scenario where the stream is generated by multiple devices in the field and some of the devices report their events late. If we apply a watermark, Spark maintains a single eventTime watermark across all events rather than per groupBy key, so it drops every event from a device that is running behind (out of sync). What is the best way to handle this situation? I have modified the word count program from the Spark Structured Streaming examples to demonstrate the problem.
import java.sql.Timestamp
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}
case class DeviceData(deviceId: String, value: Double, userId: String, timestamp: Timestamp)
object StructuredNetworkWordCountWindowed {

  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      System.err.println("Usage: StructuredNetworkWordCountWindowed <hostname> <port>" +
        " <window duration in seconds> [<slide duration in seconds>]")
      System.exit(1)
    }

    val host = args(0)
    val port = args(1).toInt
    val windowSize = args(2).toInt
    val slideSize = if (args.length == 3) windowSize else args(3).toInt
    if (slideSize > windowSize) {
      System.err.println("<slide duration> must be less than or equal to <window duration>")
      System.exit(1) // exit instead of continuing with an invalid slide duration
    }
    val windowDuration = s"$windowSize seconds"
    val slideDuration = s"$slideSize seconds"

    val spark = SparkSession
      .builder
      .appName("StructuredNetworkWordCountWindowed")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    // Create a DataFrame representing the stream of input lines from the connection to host:port
    val lines = spark.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()
    // Parse each "deviceId,value,userId,epochMillis" line into a DeviceData record
    val deviceDF: DataFrame = lines.as[String]
      .map(_.split(","))
      .map(value => DeviceData(value(0), value(1).toDouble, value(2), new Timestamp(value(3).toLong)))
      .toDF()
    // Group the data by window and deviceId and compute the count of each group
    val windowedCounts = deviceDF
      .withWatermark("timestamp", "2 minutes")
      .groupBy(window($"timestamp", windowDuration, slideDuration), $"deviceId")
      .count()

    val query = windowedCounts.writeStream
      .outputMode("append")
      .format("console")
      .option("truncate", "false")
      .start()

    query.awaitTermination()
  }
}
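For reference, the program expects one comma-separated record per line on the socket, in the form deviceId,value,userId,epochMillis, matching the split above. With hypothetical values, and assuming port 9999, the input can be fed like this (the second record's timestamp is 5 minutes behind the first):

nc -lk 9999
device1,1.0,user1,1500000000000
device2,2.5,user1,1499999700000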
If device1 is synced almost in near real time while device2 lags by 5 minutes, the program ignores events from device2 entirely. Is there a way to apply the watermark per groupBy key instead of maintaining it globally?
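One direction I have considered, shown as a minimal sketch below, is to sidestep withWatermark altogether and enforce a per-device lateness bound with arbitrary stateful processing via flatMapGroupsWithState. Everything here beyond the flatMapGroupsWithState API itself is hypothetical: DeviceState, DeviceCount, updatePerDevice, and the 2-minute bound are illustrative names, the count is a running total rather than a windowed one, and windowing would still have to be implemented inside the state function. It reuses DeviceData and deviceDF from the program above.

import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}

// Hypothetical per-device state: the latest event time seen and a running count.
case class DeviceState(maxEventTime: Long, count: Long)
case class DeviceCount(deviceId: String, count: Long)

// Drop an event only if it is more than 2 minutes behind the latest
// event time seen for the *same* device, instead of a global watermark.
def updatePerDevice(
    deviceId: String,
    events: Iterator[DeviceData],
    state: GroupState[DeviceState]): Iterator[DeviceCount] = {
  val latenessMillis = 2 * 60 * 1000L
  val old = state.getOption.getOrElse(DeviceState(0L, 0L))
  var maxTime = old.maxEventTime
  var count = old.count
  events.foreach { e =>
    val t = e.timestamp.getTime
    if (t >= maxTime - latenessMillis) count += 1 // within this device's own bound
    if (t > maxTime) maxTime = t
  }
  state.update(DeviceState(maxTime, count))
  Iterator(DeviceCount(deviceId, count))
}

val perDeviceCounts = deviceDF.as[DeviceData]
  .groupByKey(_.deviceId)
  .flatMapGroupsWithState(OutputMode.Update, GroupStateTimeout.NoTimeout)(updatePerDevice _)

With NoTimeout no query-wide watermark is needed, so a device that lags by 5 minutes (or hours) still updates its own count; the trade-offs are that state for devices that go silent is never cleaned up unless a timeout is added, and the sink has to run with outputMode("update") instead of "append".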