我编写了一段代码:当(在生产者中)给定一个文件时,它通过 Kafka 直接流(direct stream)对其中的单词进行计数。
代码:
from pyspark import SparkConf, SparkContext
from operator import add
import sys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
## Constants
APP_NAME = "PythonStreamingDirectKafkaWordCount"
# Batch interval, in seconds, passed to StreamingContext.
BATCH_INTERVAL_SECONDS = 2


## OTHER FUNCTIONS/CLASSES
def main():
    """Run a streaming word count over a Kafka topic via the direct stream API.

    Expects exactly two command-line arguments: the Kafka broker list
    (``sys.argv[1]``) and the topic name (``sys.argv[2]``). Each message is
    split on single spaces, the words are counted per batch, and the counts
    are printed with ``pprint()``. Blocks until the streaming context
    terminates.

    Raises:
        SystemExit: if the number of command-line arguments is not two.
    """
    # Validate the arguments before any Spark resources are created, so a
    # bad invocation fails fast with a readable message instead of an
    # unpacking ValueError.
    if len(sys.argv) != 3:
        sys.exit("Usage: <script> <broker_list> <topic>")
    brokers, topic = sys.argv[1:]

    sc = SparkContext(appName=APP_NAME)
    ssc = StreamingContext(sc, BATCH_INTERVAL_SECONDS)

    kvs = KafkaUtils.createDirectStream(
        ssc, [topic], {"metadata.broker.list": brokers}
    )
    # Each stream element is indexed with [1] to get the text to split;
    # presumably elements are (key, value) pairs -- confirm against the
    # KafkaUtils documentation for the Spark version in use.
    lines = kvs.map(lambda x: x[1])
    counts = (
        lines.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(add)
    )
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()


if __name__ == "__main__":
    main()
现在需要通过 DStream 将输入的 JSON 文件转换为 Spark DataFrame。
答案 0 :(得分:1)
这应该有效:
一旦变量 kvs 中保存了 TransformedDStream,
就可以对该 DStream 做 map,并像下面这样把数据传给处理函数:
# Keep only element [1] of each stream record (presumably the message value
# of a (key, value) pair -- confirm), then hand every batch RDD to the handler.
# The lambda parameter is named `record` so it no longer shadows the builtin
# `tuple`, and the handler is passed directly instead of through a wrapper.
data = kvs.map(lambda record: record[1])
data.foreachRDD(readMyRddsFromKafkaStream)
您还需要定义一个处理函数,用 JSON 数据创建 DataFrame:
def readMyRddsFromKafkaStream(readRdd):
    """Load one batch RDD of JSON data into a DataFrame and display it.

    The RDD is parsed with ``spark.read.json``, registered as the temporary
    view ``temporary_table``, queried back with ``SELECT *`` and printed via
    ``show()``. Empty batches are skipped.

    Args:
        readRdd: RDD handed over by ``foreachRDD``; presumably its elements
            are JSON strings -- confirm against the producer.

    Returns:
        None.
    """
    # A streaming batch may contain no records; skip it rather than asking
    # Spark to infer a schema from nothing.
    if readRdd.isEmpty():
        return

    # NOTE(review): `spark` is not defined in this block; presumably a
    # module-level SparkSession -- confirm it exists where this is used.
    # Put RDD into a Dataframe
    df = spark.read.json(readRdd)
    # createOrReplaceTempView replaces the deprecated registerTempTable.
    df.createOrReplaceTempView("temporary_table")
    df = spark.sql(
        """
        SELECT
            *
        FROM
            temporary_table
        """
    )
    df.show()
希望它会有所帮助:)