Receiving old and duplicate windows in writeStream in Structured Streaming

Time: 2019-02-19 16:13:13

Tags: apache-spark spark-streaming

I have a Structured Streaming pipeline in which I read from Kafka with readStream and apply a groupBy on a column with a window duration of 1 second. Since my requirement is to compute a rolling count for the current day, I implemented a mutable HashMap accumulator. After the computation, I write the records back to another Kafka topic.

Input

I published the following three records to Kafka at around TimeStamp = 1550589354000:
{"Floor_Id":"Shop Floor 1","Timestamp":1550589234000,"HaltRecord":{"HaltReason":"Test1","Severity":"Low","FaultErrorCategory":"Docked","NonFaultErrorCategory":null},"Category":{"Type":"Halt","End_time":1549010152834,"Start_time":1549009072834}}
{"Floor_Id":"Shop Floor 1","Timestamp":1550589294000,"HaltRecord":{"HaltReason":"Test1","Severity":"Low","FaultErrorCategory":"Docked","NonFaultErrorCategory":null},"Category":{"Type":"Halt","End_time":1549010152834,"Start_time":1549009072834}}
{"Floor_Id":"Shop Floor 1","Timestamp":1550589354000,"HaltRecord":{"HaltReason":"Test1","Severity":"Low","FaultErrorCategory":"Docked","NonFaultErrorCategory":null},"Category":{"Type":"Halt","End_time":1549010152834,"Start_time":1549009072834}}

Actual output:

{ "Floor_Id": "Shop Floor 1", "Error_Category" : [ { "Category" : "Test1", "DataPoints" : { "NumberOfErrors": 1, "Date_Time" : 1550589295000}}]}
{ "Floor_Id": "Shop Floor 1", "Error_Category" : [ { "Category" : "Test1", "DataPoints" : { "NumberOfErrors": 2, "Date_Time" : 1550589235000}}]}
{ "Floor_Id": "Shop Floor 1", "Error_Category" : [ { "Category" : "Test1", "DataPoints" : { "NumberOfErrors": 3, "Date_Time" : 1550589295000}}]}
{ "Floor_Id": "Shop Floor 1", "Error_Category" : [ { "Category" : "Test1", "DataPoints" : { "NumberOfErrors": 4, "Date_Time" : 1550589355000}}]}
{ "Floor_Id": "Shop Floor 1", "Error_Category" : [ { "Category" : "Test1", "DataPoints" : { "NumberOfErrors": 5, "Date_Time" : 1550589235000}}]}

Expected output:

{ "Floor_Id": "Shop Floor 1", "Error_Category" : [ { "Category" : "Test1", "DataPoints" : { "NumberOfErrors": 1, "Date_Time" : 1550589235000}}]}
{ "Floor_Id": "Shop Floor 1", "Error_Category" : [ { "Category" : "Test1", "DataPoints" : { "NumberOfErrors": 2, "Date_Time" : 1550589295000}}]}
{ "Floor_Id": "Shop Floor 1", "Error_Category" : [ { "Category" : "Test1", "DataPoints" : { "NumberOfErrors": 3, "Date_Time" : 1550589355000}}]}

I am seeing duplicate records for window times 1550589295000 and 1550589235000.

Code written so far:

// Imports assumed (not shown in the original post); MutableHashMap is taken to
// alias scala.collection.mutable.HashMap.
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.types._
import org.apache.spark.sql.{ForeachWriter, Row, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{Accumulable, AccumulableParam, SparkConf, SparkContext}

import scala.collection.mutable.{HashMap => MutableHashMap}

object ErrorCategory extends App {

  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)

  val readServer = "<host>:<port>"
  val readTopic = "<topic1>"
  val writeServer = "<host>:<port>"
  val writeTopic = "<topic2>"

  val spark = SparkSession.builder
    .appName("StreamAssetList")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()

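  // Driver-side accumulator holder used to keep the day-level rolling counts across micro-batches.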
  val hma = new HashMapAccumulator()

  val ssc = new StreamingContext(spark.sparkContext, Seconds(1))

  val streamingInputDf = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", readServer)
    .option("subscribe", readTopic)
    .load()


  val schema = new StructType()
    .add("Floor_Id", StringType)
    .add("Category",
      new StructType()
        .add("Type", StringType)
        .add("End_time", LongType)
        .add("Start_time", LongType))
    .add("HaltRecord",
      new StructType()
        .add("HaltReason", StringType)
        .add("Severity", StringType)
        .add("FaultErrorCategory", StringType)
        .add("NonFaultErrorCategory", StringType))
    .add("Timestamp", LongType)


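  // Parse the Kafka message value as JSON and flatten the fields used downstream.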
  val streamingSelectDF = streamingInputDf.selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), schema = schema) as "data")
    .select("data.Floor_Id", "data.Category.Type", "data.Timestamp", "data.HaltRecord.HaltReason")

  println("\n\n\t\t<< Ready to process messages >> \n")
  //  streamingSelectDF.printSchema()

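  // Count halt records per Floor_Id and HaltReason over 1-second tumbling windows,
  // then publish each aggregated row to Kafka through a custom ForeachWriter.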
  val filterDF = streamingSelectDF
    .filter(
      streamingSelectDF("Floor_Id") === "Shop Floor 1" &&
        streamingSelectDF("IsError") === "y" && streamingSelectDF("Type") === "Halt")
    .groupBy(streamingSelectDF("Floor_Id"), streamingSelectDF("HaltReason"), window(to_timestamp(from_unixtime(streamingSelectDF("Timestamp")/1000)), "1 second", "1 second"))
    .count().writeStream.foreach(new ForeachWriter[Row] {

    var producer: KafkaProducer[String, String] = _

    override def open(partitionId: Long, version: Long): Boolean = {

      val kafkaProperties = new Properties()

      kafkaProperties.put("bootstrap.servers", writeServer)
      kafkaProperties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
      kafkaProperties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

      producer = new KafkaProducer(kafkaProperties)
      true
    }

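    // For every aggregated row: merge the window's count into the day-level accumulator,
    // rebuild the JSON payload from the accumulated totals, and send it to the output topic.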
    override def process(value: Row): Unit = {

      val dataAcc = hma.getInstance(spark.sparkContext)

      val floorId = value.getAs[String]("Floor_Id")
      val haltReason = value.getAs[String]("HaltReason")
      val count = value.getAs[Long]("count")

      val t = value.getAs[Row]("window").getAs[java.sql.Timestamp](1)

      val key = floorId +":"+haltReason
      if(floorId == null || floorId.trim.length == 0){
        return;
      }
      if(dataAcc.value.contains(key)){
        val prevCount = dataAcc.value(key)
        dataAcc.value(key) = prevCount+count
      }else{
        dataAcc += (key -> count)
      }

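      // Reset the accumulated counts when the calendar day rolls over, so the totals restart daily.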
      var day = java.util.Calendar.getInstance().get(java.util.Calendar.DATE).toLong

      if (dataAcc.value.contains("Day")) {
        val id = dataAcc.value("Day")
        if (id != day)
          dataAcc.zero
      } else {
        dataAcc += ("Day" -> day)
      }

      var oMap = dataAcc.value - "Day";
      var outputMap = new MutableHashMap[String, MutableHashMap[String,Long]]()

      for (key <- oMap.keys) {

        val fid = key.split(":")(0)
        val cat = key.split(":")(1)

        if(outputMap.contains(fid)){
          var catMap = outputMap(fid)

          if(catMap  != null && catMap.contains(cat)){
            catMap  += (cat -> (catMap (cat) + dataAcc.value(key)) )
          }else{
            outputMap(fid) += (cat ->  dataAcc.value(key))
          }
        }else{
          var catMap = new MutableHashMap[String,Long]()
          catMap += (cat -> dataAcc.value(key))
          outputMap += (fid  -> catMap)
        }
      }

      var output = ""

      for (field <- outputMap.keys) {
        output += "{ \"Floor_Id\": \"" + field + "\", \"Error_Category\" : ["
        for (error <- outputMap(field).keys) {
          output += " { \"Category\" : \"" + error + "\", \"DataPoints\" : { \"NumberOfErrors\": " + outputMap(field)(error) +
            ", \"Date_Time\" : " + t.getTime + "}},"
        }
      }

      if(output.indexOf("Floor_Id") > 0){
        if(output.endsWith(",")){
          output = output.substring(0,output.lastIndexOf(","))
        }
        output += "]}"
        //        println("output > "+output)
        producer.send(new ProducerRecord(writeTopic, output))
      }


    }

    override def close(errorOrNull: Throwable): Unit = {
      producer.close()
    }
  }).outputMode("complete")
    .trigger(Trigger.ProcessingTime("2 seconds"))
    .start.awaitTermination()

} 

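// Custom accumulator that merges (key, count) pairs into a mutable HashMap;
// getInstance lazily registers a single shared instance with the SparkContext.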
class HashMapAccumulator extends AccumulableParam[MutableHashMap[String, Long], (String, Long)] {

  private var accumulator: Accumulable[MutableHashMap[String, Long], (String, Long)] = _

  def addAccumulator(acc: MutableHashMap[String, Long], elem: (String, Long)): MutableHashMap[String, Long] = {
    val (k1, v1) = elem
    acc += acc.find(_._1 == k1).map {
      case (k2, v2) => k2 -> (v1 + v2)
    }.getOrElse(elem)

    acc
  }

  def addInPlace(acc1: MutableHashMap[String, Long], acc2: MutableHashMap[String, Long]): MutableHashMap[String, Long] = {
    acc2.foreach(elem => addAccumulator(acc1, elem))
    acc1
  }

  def zero(initialValue: MutableHashMap[String, Long]): MutableHashMap[String, Long] = {
    val ser = new JavaSerializer(new SparkConf(false)).newInstance()
    val copy = ser.deserialize[MutableHashMap[String, Long]](ser.serialize(initialValue))
    copy.clear()
    copy
  }

  def getInstance(sc: SparkContext): Accumulable[MutableHashMap[String, Long], (String, Long)] = {
    if (accumulator == null) {
      synchronized {
        if (accumulator == null) {
          accumulator = sc.accumulable(MutableHashMap.empty[String, Long],"ErrorCountAccumulator")(new HashMapAccumulator)
        }
      }
    }
    accumulator
  }

}

Questions:

  • Why am I receiving records for old window times at all? According to the documentation, Structured Streaming handles late-arriving data, but in my case I published the data to the Kafka broker in order.
  • Is this behavior related to the at-least-once write semantics when writing to Kafka?

I am a beginner with Spark Streaming, so any help is appreciated.

1 answer:

Answer 0 (score: 0)

I fixed this by using output mode Update.
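
For reference, below is a minimal, self-contained sketch of that change (not the full job from the question): the same kind of 1-second windowed count, but written with outputMode("update"), which only emits rows whose aggregate changed since the last trigger, whereas "complete" mode re-emits every window in the result table on each trigger. The broker/topic names are placeholders, the console sink stands in for the Kafka ForeachWriter, the sketch windows on the Kafka record timestamp rather than parsing the event time out of the JSON payload, and the withWatermark call is an additional assumption that lets Spark eventually drop state for old windows.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window
import org.apache.spark.sql.streaming.Trigger

object UpdateModeSketch extends App {

  val spark = SparkSession.builder
    .appName("UpdateModeSketch")
    .getOrCreate()

  import spark.implicits._

  // Placeholder broker/topic, as in the question.
  val input = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "<host>:<port>")
    .option("subscribe", "<topic1>")
    .load()

  val counts = input
    .selectExpr("CAST(value AS STRING) AS value", "timestamp")
    // Assumption: a watermark bounds state so windows older than the threshold can be dropped.
    .withWatermark("timestamp", "10 seconds")
    .groupBy(window($"timestamp", "1 second"), $"value")
    .count()

  counts.writeStream
    // "update": only rows whose count changed since the last trigger are written to the sink,
    // unlike "complete", which rewrites every window in the result table each trigger.
    .outputMode("update")
    .format("console")
    .trigger(Trigger.ProcessingTime("2 seconds"))
    .start()
    .awaitTermination()
}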