from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SQLContext
import sys
import requests
import traceback
conf = SparkConf()
conf.setAppName("TwitterStreamApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 5)  # 5-second batch interval
ssc.checkpoint("checkpoint_TwitterApp")
dataStream = ssc.socketTextStream("localhost", 9001)
def aggregate_tags_count(new_values, total_sum):
    # Running total: add this batch's counts to the previous state
    return sum(new_values) + (total_sum or 0)

def get_sql_context_instance(spark_context):
    # Lazily create a singleton SQLContext
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']

def process_rdd(time, rdd):
    # Print the contents of each batch
    print("----------- %s -----------" % str(time))
    try:
        print(rdd.collect())
    except Exception:
        traceback.print_exc()

def release_rdd(time, rdd):
    rdd.unpersist()
words = dataStream.flatMap(lambda line: line.split(" "))
hashtags = words.map(lambda x: (x, 1))
# updateStateByKey keeps a running total across all batches
tags_totals = hashtags.updateStateByKey(aggregate_tags_count)
tags_totals.foreachRDD(process_rdd)
dataStream.foreachRDD(release_rdd)
ssc.start()
ssc.awaitTermination()
The code implements a word count and runs without errors. The problem is that it is cumulative, whereas I want each window (5 seconds in my program) to start from a cleared dataStream.
For example, if I receive (a, 10) in the first window and (a, 20) in the second window, I want it to output only (a, 20). But my code outputs the sum of (a, 10) and (a, 20), i.e. (a, 30).
Can anyone help me?
Answer 0 (score: 0)
Don't use a stateful transformation. Just use reduceByKey:
from operator import add
tags_totals = (dataStream
.flatMap(lambda line: line.split(" "))
.map(lambda x: (x, 1))
.reduceByKey(add))
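For context, here is a minimal sketch of how the stateless version could replace the original pipeline; the socket host/port and the 5-second batch interval are taken from the question, and the rest is an untested illustration rather than a verified drop-in. Since no state is carried across batches, updateStateByKey, the checkpoint directory, and the manual unpersist call are no longer needed:
from operator import add
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName("TwitterStreamApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

ssc = StreamingContext(sc, 5)  # 5-second batches
dataStream = ssc.socketTextStream("localhost", 9001)

# Each batch is reduced independently, so the counts reset every 5 seconds
tags_totals = (dataStream
               .flatMap(lambda line: line.split(" "))
               .map(lambda x: (x, 1))
               .reduceByKey(add))

def process_rdd(time, rdd):
    print("----------- %s -----------" % str(time))
    print(rdd.collect())

tags_totals.foreachRDD(process_rdd)

ssc.start()
ssc.awaitTermination()
If instead you want a sliding aggregation longer than one batch, reduceByKeyAndWindow (for example .reduceByKeyAndWindow(add, None, 30, 5) for 30-second windows sliding every 5 seconds) gives windowed counts without accumulating forever; supplying an inverse function instead of None would additionally require checkpointing to be enabled.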