我正在尝试使用Scala在Spark Streaming中创建一个问候世界。我不想从套接字或文件中读取字符串,而是想从每1秒生成一个String的线程中读取它。我的SparkStream上下文设置为使用5秒的迷你批处理。因此,我希望线程内部生成的字符串是该字符串的5倍。但是,我只计数一次我每5秒生成的字符串一次。 还有其他方法可以在Spark Streaming中实现源功能吗?
package org.sense.spark.app
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf}
object TestStreamCombineByKey {
def main(args: Array[String]): Unit = {
// Create a local StreamingContext with two working thread and batch interval of 1 second.
// The master requires 2 cores to prevent from a starvation scenario.
val conf = new SparkConf().setMaster("local[2]").setAppName("LocalWordCount")
val ssc = new StreamingContext(conf, Seconds(5))
// Create a DStream that will connect to hostname:port, like localhost:9999
// val lines = ssc.socketTextStream("localhost", 9999)
val queue = new scala.collection.mutable.Queue[RDD[String]]
val thread = new Thread("pool data source") {
override def run() {
while (true) {
queue.enqueue(ssc.sparkContext.parallelize(List("to be or not to be , that is the question")))
Thread.sleep(1000)
}
}
}
thread.start()
val lines = ssc.queueStream(queue)
// Split each line into words
val words = lines.flatMap(_.split(" "))
// Count each word in each batch
val pairs = words.map(word => (word, 1))
// val wordCounts = pairs.reduceByKey(_ + _)
val wordCounts = pairs.combineByKey(
(v) => (v, 1), //createCombiner
(acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), //mergeValue
(acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2), // mergeCombiners
new HashPartitioner(3)
)
// Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.print()
ssc.start() // Start the computation
ssc.awaitTermination() // Wait for the computation to terminate
}
}
输出:
-------------------------------------------
Time: 1570784580000 ms
-------------------------------------------
(or,(1,1))
(the,(1,1))
(not,(1,1))
(is,(1,1))
(that,(1,1))
(be,(2,2))
(question,(1,1))
(to,(2,2))
(,,(1,1))
-------------------------------------------
Time: 1570784585000 ms
-------------------------------------------
(or,(1,1))
(the,(1,1))
(not,(1,1))
(is,(1,1))
(that,(1,1))
(be,(2,2))
(question,(1,1))
(to,(2,2))
(,,(1,1))