Using a Kafka DStream in a Spark Streaming Process

Asked: 2018-01-23 15:55:14

Tags: json apache-kafka spark-streaming

I am consuming a Kafka topic in a Spark Streaming program like this:

import ...

object KafkaStreaming {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("KafkaStreaming").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(10))
    val kafkaConf = Map(
      ...
    )

    val messages = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Seq("topic"), kafkaConf)
    )
    val lines: DStream[String] = messages.map(_.value)
    val line: DStream[String] = lines.flatMap(_.split("\n"))

    process(line)

    ssc.start()
    ssc.awaitTermination()
  }

  def process(line: DStream[String]): Unit = {
    // here is where I want to convert the DStream to JSON;
    // this does not compile: JSON.parseFull expects a String, not a DStream
    var json: Option[Any] = JSON.parseFull(line) // <--
    println(json.getOrElse("json is NULL"))
    if (json.isDefined) {
      println("NOT FALSE")
      var map = json.get.asInstanceOf[Map[String, Any]]

      // use every member of the JSON document to access its value
      map.get("any json element").toString
      // do some other manipulation
    }
  }
}

Inside the process function, I want to manipulate each line of the stream to extract a JSON object from it, then do further processing and persist it. How can I do that?

1 Answer:

Answer 0 (score: 1)

You have a DStream[String]; do the per-record transformation with DStream.map before the foreachRDD, and make process operate on a single String rather than on the whole DStream:

def process(line: String): String = ???
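
For reference, a minimal sketch of what process could look like, sticking with the JSON.parseFull parser the question already uses; "someField" is a hypothetical placeholder for whatever key your documents actually contain:

import scala.util.parsing.json.JSON

// Sketch only: parse one line as JSON and extract a single field.
// "someField" is a hypothetical key, not taken from the question.
def process(line: String): String =
  JSON.parseFull(line) match {
    case Some(map: Map[String, Any] @unchecked) =>
      // take the field if present, otherwise fall back to an empty string
      map.get("someField").map(_.toString).getOrElse("")
    case _ =>
      "" // the line was not a JSON object
  }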

Then:

messages
  .map(_.value)
  .flatMap(_.split("\n"))
  .map(process)
  .foreachRDD { rdd =>
    rdd.foreachPartition { itr =>
      // Do stuff with `Iterator[String]` after the JSON transformation
    }
  }
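
Since the question also mentions persistence, foreachPartition is the usual place for it: it lets you open one connection per partition instead of one per record. A hedged sketch, where MySink and its open/write/close methods are hypothetical placeholders for your actual store:

messages
  .map(_.value)
  .flatMap(_.split("\n"))
  .map(process)
  .foreachRDD { rdd =>
    rdd.foreachPartition { itr =>
      val sink = MySink.open()      // hypothetical: one connection per partition
      try itr.foreach(sink.write)   // persist each transformed line
      finally sink.close()
    }
  }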