Restart ReduceByKeyAndWindows

Date: 2016-03-25 23:15:07

Tags: scala apache-spark spark-streaming

I am new to Spark and Scala, and I am using reduceByKeyAndWindow to count the words in Kafka messages because I need the windowing functionality.

The purpose of my application is to send an alert when it detects that "x" messages from Kafka containing a specific word have arrived within a given time window, and then to start counting again from zero.

The code below detects the word, but I cannot restart the count. I am wondering whether it is possible to reset the accumulation of reduceByKeyAndWindow, or whether there is another way to do this.

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3

object KafKaWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[2]").setAppName("KafKaWordCount")
    val ssc = new StreamingContext(conf, Seconds(2))

    ssc.checkpoint("checkpoint")

    val lines = ssc.socketTextStream("localhost", 9999) //using NETCAT for test 
    val wordCounts = 
        lines.map(x => (x, 1))
             .reduceByKeyAndWindow(_ + _, _ - _, Seconds(60), Seconds(2), 2) 

    //if the count for a key (word) exceeds 10, send an alert and restart its count
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
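
For context, in the real application the lines come from Kafka rather than netcat. A minimal sketch of what the source could look like (assuming the spark-streaming-kafka 0.8 artifact for Spark 1.6, with a hypothetical broker at localhost:9092 and a hypothetical topic named "words"):

import org.apache.spark.streaming.kafka.KafkaUtils
import kafka.serializer.StringDecoder

// hypothetical broker address and topic name, for illustration only
val kafkaParams = Map("metadata.broker.list" -> "localhost:9092")
val topics = Set("words")

val lines = KafkaUtils
  .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
  .map(_._2) // keep only the message value, as with socketTextStream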

Using the second example from Yuval Itzchakov, lowering the threshold from 10 to 3, and sending 7 messages.

The output of the second answer is:

Word: hello reached count: 1
Word: hello reached count: 2
//This message is not printed, which is fine, but the next count does not restart at 1
Word: hello reached count: 4
Word: hello reached count: 5
Word: hello reached count: 6
Word: hello reached count: 7

The output I expected:

Word: hello reached count: 1
Word: hello reached count: 2

Word: hello reached count: 1
Word: hello reached count: 2

Word: hello reached count: 1

1 answer:

Answer 0 (score: 0)

If you are using Spark 1.6.0 or later, you can use the experimental DStream.mapWithState to keep an updated state for each word count. Once the limit is reached, you can remove the state and release the pair into the pipeline, then print it out with DStream.foreachRDD:

object KafKaWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("KafKaWordCount")

    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("checkpoint")

    val lines = ssc.socketTextStream("localhost", 9999) //using NETCAT for test
    val stateSpec = StateSpec.function(updateWordCount _)

    lines.map(x => (x, 1))
      .reduceByKeyAndWindow(_ + _, _ - _, Seconds(60), Seconds(2), 2)
      .mapWithState(stateSpec)
      .filter(_.isDefined)
      .foreachRDD(rdd =>
        // after the filter every element is a Some, so match on it directly
        rdd.foreach { case Some((word, count)) =>
          println(s"Word: $word reached count: $count") })
    ssc.start()
    ssc.awaitTermination()
  }

  def updateWordCount(key: String, 
                      value: Option[Int], 
                      state: State[(String, Int)]): Option[(String, Int)] = {
    // once the windowed count reaches the threshold, drop the key's state and
    // emit the pair downstream; otherwise just update the state and emit nothing
    def updateCountState(count: Int): Option[(String, Int)] = {
      if (count == 10) {
        if (state.exists()) state.remove()
        Some((key, count))
      }
      else {
        state.update((key, count))
        None
      }
    }
    }

    value match {
      case Some(count) => updateCountState(count)
      case _ => None
    }
  }
}
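
As a possible refinement (not part of the original answer, just a sketch of the Spark 1.6 API): StateSpec also accepts an idle timeout, so the state of keys that stop receiving data can be dropped automatically:

// hypothetical variant of the stateSpec above: expire a key's state after it
// has received no data for 60 seconds (StateSpec.timeout, Spark 1.6+)
val stateSpec = StateSpec.function(updateWordCount _)
                         .timeout(Seconds(60))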

If not, you can fall back to the slower DStream.updateStateByKey:

object KafKaWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("KafKaWordCount")
    val ssc = new StreamingContext(conf, Seconds(2))

    ssc.checkpoint("checkpoint")

    val lines = ssc.socketTextStream("localhost", 9999) //using NETCAT for test

    lines.map(x => (x, (x, 1)))
         .reduceByKeyAndWindow((first: (String, Int), second: (String, Int)) =>
                               (first._1, first._2 + second._2), Seconds(60), Seconds(60), 2)
         .updateStateByKey(updateSeqCount _)
         .print(1)

    ssc.start()
    ssc.awaitTermination()
  }

  def updateSeqCount(values: Seq[(String, Int)],
                     state: Option[(String, Int)]): Option[(String, Int)] = {
    if (values.isEmpty) state
    else {
      val (word, count) = values.head
      if (count == 10) {
        // threshold reached: alert and return None so the key's state is removed
        println(s"Key: $word reached count $count!")
        None
      }
      else Some((word, count))
    }
  }
}