使用spark-streaming来执行sql并将DataFrame写入spark中的HDFS作为表而不是许多空文件或小文件

时间:2017-11-17 08:58:03

标签: hdfs spark-streaming

def main(args: Array[String]) {
  // Local 2-core streaming context; pulls a micro-batch from Kafka every 5 seconds.
  val sparkConf = new SparkConf().setMaster("local[2]").setAppName("kafka-spark-demo")
  val scc = new StreamingContext(sparkConf, Duration(5000))
  val topics = Set("test1")
  val kafkaParam = Map(
    "metadata.broker.list" -> "localhost:9092"
  )

  // Base HDFS directory for per-batch word-count output.
  // NOTE(review): the original referenced an undefined `dir` — make this configurable.
  val outputDir = if (args.nonEmpty) args(0) else "/tmp/wordcount-output"

  val stream: InputDStream[(String, String)] = createStream(scc, kafkaParam, topics)

  // Kafka records arrive as (key, value) pairs; keep the value and split on spaces.
  val words = stream.map(_._2).flatMap(_.split(" "))

  // Convert each micro-batch RDD to a DataFrame and run the SQL word count.
  words.foreachRDD { (rdd: RDD[String], time: Time) =>
    // Get the singleton instance of SparkSession
    val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
    import spark.implicits._

    // Convert RDD[String] to RDD[case class] to DataFrame
    val wordsDataFrame = rdd.map(w => Record(w)).toDF()

    // Creates a temporary view using the DataFrame
    wordsDataFrame.createOrReplaceTempView("words")

    // Do word count on table using SQL and print it
    val wordCountsDataFrame =
      spark.sql("select word, count(*) as total from words group by word")
    wordCountsDataFrame.show()

    // FIX: the original fused show() and saveAsTextFile(dir) on one line (a
    // syntax error) with `dir` undefined. Additionally:
    //  - an empty batch would still create an output directory of empty files,
    //  - a many-partition RDD yields one small part-file per partition,
    //  - saveAsTextFile throws if the target path already exists (every 2nd batch).
    // Coalescing to one partition and writing to a per-batch subdirectory
    // addresses all three.
    if (!rdd.isEmpty()) {
      wordCountsDataFrame.rdd
        .coalesce(1)
        .saveAsTextFile(s"$outputDir/batch-${time.milliseconds}")
    }
  }

  scc.start()
  scc.awaitTermination()
}

这是我的代码：使用 spark-streaming 执行 SQL 查询并将结果保存到 HDFS。但我发现输出的是大量空文件或小文件（每个只有一行数据），而不是一张完整的表。

1 个答案:

答案 0（得分：0）

请尝试这个并告诉我它是否有帮助。

wordCountsDataFrame.rdd.repartition(1).saveAsTextFile("/path/to/output")

说明：repartition(1) 会先把所有数据合并到一个分区再写出，因此每个批次只生成一个输出文件；如果只是减少分区数，也可以改用 coalesce(1) 以避免一次完整的 shuffle。另外注意 saveAsTextFile 要求目标路径不存在，建议为每个批次使用不同的子目录（例如按批次时间命名）。