I'm new to Spark and Scala, and I'm using reduceByKeyAndWindow to count words in Kafka messages, because I need the windowing functionality.
The purpose of my application is to send an alert when it detects that messages from Kafka contain a specific word "x" times within a given time span, and then to start over from the beginning.
The code below detects the word, but I can't get my application to restart the count. I'm wondering whether it's possible to reset the accumulation of reduceByKeyAndWindow, or whether there's another way to do this.
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3

object KafKaWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[2]").setAppName("KafKaWordCount")
    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("checkpoint")

    val lines = ssc.socketTextStream("localhost", 9999) // using netcat for testing

    val wordCounts =
      lines.map(x => (x, 1))
           .reduceByKeyAndWindow(_ + _, _ - _, Seconds(60), Seconds(2), 2)

    // if the count for a key (word) exceeds 10, send an alert and restart the count
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
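To test without Kafka, I feed input with netcat on the same port the socket stream reads from:

$ nc -lk 9999
hello
hello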
Using the second example from Yuval Itzchakov's answer below, lowering the threshold from 10 to 3, and sending 7 messages, the output of the second answer is:
Word: hello reached count: 1
Word: hello reached count: 2
// count 3 is not printed, which is fine, but the next count does not restart at 1
Word: hello reached count: 4
Word: hello reached count: 5
Word: hello reached count: 6
Word: hello reached count: 7
The output I expect is:
Word: hello reached count: 1
Word: hello reached count: 2
Word: hello reached count: 1
Word: hello reached count: 2
Word: hello reached count: 1
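For reference, the only change I made to the second example was the threshold in the state function (lowered from 10 to 3 for testing):

if (count == 3) { // was: count == 10
  println(s"Key: $word reached count $count!")
  None
}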
Answer 0 (score: 0)
If you're using Spark 1.6.0 or above, you can use the experimental DStream.mapWithState to keep an updated state of the word counts. Once the limit is reached, you can remove the state, release the pair down the pipeline, and print it out with DStream.foreachRDD:
object KafKaWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("KafKaWordCount")

    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("checkpoint")

    val lines = ssc.socketTextStream("localhost", 9999) // using netcat for testing
    val stateSpec = StateSpec.function(updateWordCount _)

    lines.map(x => (x, 1))
         .reduceByKeyAndWindow(_ + _, _ - _, Seconds(60), Seconds(2), 2)
         .mapWithState(stateSpec)
         .filter(_.isDefined)
         .map(_.get) // unwrap the Option so we can pattern match on the pair
         .foreachRDD(rdd =>
           rdd.foreach { case (word, count) =>
             println(s"Word: $word reached count: $count")
           })

    ssc.start()
    ssc.awaitTermination()
  }

  def updateWordCount(key: String,
                      value: Option[Int],
                      state: State[(String, Int)]): Option[(String, Int)] = {
    def updateCountState(count: Int): Option[(String, Int)] = {
      if (count == 10) {
        // limit reached: drop the state and emit the pair downstream
        if (state.exists()) state.remove()
        Some((key, count))
      } else {
        state.update((key, count))
        None
      }
    }

    value match {
      case Some(count) => updateCountState(count)
      case _ => None
    }
  }
}
If not, you can follow the same approach with the slower DStream.updateStateByKey:
object KafKaWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("KafKaWordCount")

    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("checkpoint")

    val lines = ssc.socketTextStream("localhost", 9999) // using netcat for testing

    lines.map(x => (x, (x, 1)))
         .reduceByKeyAndWindow((first: (String, Int), second: (String, Int)) =>
           (first._1, first._2 + second._2), Seconds(60), Seconds(60), 2)
         .updateStateByKey(updateSeqCount _)
         .print(1)

    ssc.start()
    ssc.awaitTermination()
  }

  def updateSeqCount(values: Seq[(String, Int)],
                     state: Option[(String, Int)]): Option[(String, Int)] = {
    if (values.isEmpty) state
    else {
      val (word, count) = values.head
      if (count == 10) {
        // limit reached: alert and clear the state for this key
        println(s"Key: $word reached count $count!")
        None
      } else Some((word, count))
    }
  }
}
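A note on the follow-up output: reduceByKeyAndWindow recomputes the 60-second windowed total on every batch, independently of the keyed state, so removing the state does not clear the window; that is why the count continues at 4 instead of restarting at 1. Below is a minimal sketch of one possible workaround (my assumption, not part of the answer above): keep the running count in the state itself over a plain per-batch reduceByKey, so that state.remove() genuinely restarts the count from zero. This drops the sliding-window semantics; a timestamp kept in the state (or a StateSpec timeout) could be used to expire old counts.

import org.apache.spark._
import org.apache.spark.streaming._

object ResettableWordCount {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[2]").setAppName("ResettableWordCount")
    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("checkpoint")

    val lines = ssc.socketTextStream("localhost", 9999) // using netcat for testing

    lines.map(x => (x, 1))
         .reduceByKey(_ + _) // per-batch counts only, no window
         .mapWithState(StateSpec.function(updateWordCount _))
         .filter(_.isDefined)
         .map(_.get)
         .foreachRDD(rdd => rdd.foreach { case (word, count) =>
           println(s"ALERT - word: $word reached count: $count")
         })

    ssc.start()
    ssc.awaitTermination()
  }

  // The running count lives in the state, not in a window, so removing
  // the state really does restart the count at zero for that word.
  def updateWordCount(key: String,
                      value: Option[Int],
                      state: State[Int]): Option[(String, Int)] = {
    val newCount = state.getOption().getOrElse(0) + value.getOrElse(0)
    if (newCount >= 3) { // threshold (3 here, matching the test above)
      if (state.exists()) state.remove() // reset: next occurrence starts at 1
      Some((key, newCount))              // emit the alert downstream
    } else {
      state.update(newCount)
      None                               // below the threshold: stay silent
    }
  }
}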