import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

// Record and SparkSessionSingleton are assumed to be defined as in the
// official Spark Streaming SQL example (a one-field case class and a
// lazily initialized SparkSession holder).
def main(args: Array[String]): Unit = {
  val sparkConf = new SparkConf().setMaster("local[2]").setAppName("kafka-spark-demo")
  val scc = new StreamingContext(sparkConf, Duration(5000))
  val topics = Set("test1")
  val kafkaParam = Map(
    "metadata.broker.list" -> "localhost:9092"
  )
  // Direct (receiver-less) Kafka stream; spark-streaming-kafka 0.8 API
  val stream: InputDStream[(String, String)] =
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](scc, kafkaParam, topics)
  val words = stream.map(_._2) // keep the message value, drop the key
    .flatMap(_.split(" "))
  // Convert the RDDs of the words DStream to DataFrames and run a SQL query
  words.foreachRDD { (rdd: RDD[String], time: Time) =>
    // Get the singleton instance of SparkSession
    val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
    import spark.implicits._
    // Convert RDD[String] to RDD[case class] to DataFrame
    val wordsDataFrame = rdd.map(w => Record(w)).toDF()
    // Create a temporary view using the DataFrame
    wordsDataFrame.createOrReplaceTempView("words")
    // Do word count on the table using SQL and print it
    val wordCountsDataFrame =
      spark.sql("select word, count(*) as total from words group by word")
    wordCountsDataFrame.show()
    wordCountsDataFrame.rdd.saveAsTextFile(dir) // dir: HDFS output path, defined elsewhere
  }
  scc.start()
  scc.awaitTermination()
}
This is my code. It uses Spark Streaming to run a SQL query over the incoming messages and save the result to HDFS, but instead of a table I get a large number of empty or tiny files (each with only one row of data).
Answer 0 (score: 0)
Please try this and let me know if it helps.
wordCountsDataFrame.rdd.repartition(1).saveAsTextFile("/path/to/output")
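saveAsTextFile writes one part file per RDD partition, and foreachRDD fires on every batch interval even when a batch contains no data, so every 5-second batch can produce a directory of empty or one-line part files. repartition(1) collapses each batch's output into a single file. Below is a minimal sketch of the write step under two extra assumptions beyond the answer above: empty batches are skipped, and each batch writes to its own time-stamped subdirectory (saveAsTextFile fails if the target directory already exists); /path/to/output is a placeholder path.

words.foreachRDD { (rdd: RDD[String], time: Time) =>
  // Skip empty batches entirely so no empty part files are created
  if (!rdd.isEmpty()) {
    val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
    import spark.implicits._
    val wordsDataFrame = rdd.map(w => Record(w)).toDF()
    wordsDataFrame.createOrReplaceTempView("words")
    val wordCountsDataFrame =
      spark.sql("select word, count(*) as total from words group by word")
    // coalesce(1) narrows to one partition without a shuffle, giving one
    // part file per batch; a per-batch directory avoids the "output
    // directory already exists" error on the second batch
    wordCountsDataFrame.rdd
      .coalesce(1)
      .saveAsTextFile(s"/path/to/output/batch-${time.milliseconds}")
  }
}

coalesce(1) is usually preferable to repartition(1) here because it avoids a full shuffle; either way, note that a single writer per batch can become a bottleneck if the per-batch word counts grow large.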