Unable to get messages from a Kafka topic in Spark Streaming

Time: 2017-10-27 11:48:42

Tags: apache-kafka spark-streaming

I am using KafkaUtils.createDirectStream to read messages from a Kafka topic in Spark Streaming. I submit my Spark job with spark2-submit (Cloudera distribution 5.10). The problem is that the Spark Streaming job does not receive any messages from the Kafka topic. The logs don't say anything about it, and there are no error messages.

The only related output I can see in the log is the following:

INFO utils.AppInfoParser: Kafka version : 0.10.0.0
INFO utils.AppInfoParser: Kafka commitId : b8642491e78c5a13

I am publishing messages to the topic with kafka-console-producer. When I consume the same topic with kafka-console-consumer, the messages are retrieved successfully.
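
For completeness, a programmatic version of the same producer check would look roughly like this (a sketch only; the broker address and topic name are the ones used in the job below, and the JSON payload is just a hypothetical placeholder, not my real record format):

// Sketch of a programmatic equivalent of the kafka-console-producer check:
// write a single test message to the input topic.
import java.util.Properties
import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerRecord }

val props = new Properties()
props.put("bootstrap.servers", "ud-hadoopn-1:9092")
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

val producer = new KafkaProducer[String, String](props)
// Hypothetical test payload; the real records are JSON documents read by spark.read.json
producer.send(new ProducerRecord[String, String]("mlIn", """{"KEY": 1}"""))
producer.flush()
producer.close()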

Here is my configuration:

val kafkaParams = Map(
  "bootstrap.servers" -> "ud-hadoopn-1:9092",
  "auto.offset.reset" -> "latest",
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "spark-consumer")

Please find the complete Spark Streaming code below:

package com.model

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.CrossValidatorModel
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
import org.apache.spark.streaming.{ StreamingContext, Seconds }
import org.apache.spark.ml.{ Pipeline, PipelineStage, PipelineModel }
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.linalg.Vector
import java.io.StringWriter
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.{ DeserializationFeature, SerializationFeature }
import com.fasterxml.jackson.annotation.JsonProperty
import com.fasterxml.jackson.core.JsonParseException
import org.apache.spark.sql.functions.udf
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.common.serialization.{ StringSerializer, StringDeserializer }
import java.util.Properties
import org.apache.spark.streaming.kafka010.{ KafkaUtils, LocationStrategies, ConsumerStrategies }
import scala.util.parsing.json.JSONObject

object StreamingApp {

  def main(args: Array[String]) {

    case class Prediction(
      fabricKey: Long,
      prediction: Long)

    val conf = new SparkConf().setAppName("StreamingApp").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val errorRecords = sc.accumulator(0)
    val spark = SparkSession.builder().master("local[*]")
      .appName("StreamingApp").getOrCreate()

    // 1-second micro-batches
    val ssc = new StreamingContext(sc, Seconds(1))

    import sqlContext._
    import sqlContext.implicits._

    // Consumer settings for the input topic
    val inputTopic = "mlIn"
    val kafkaParams = Map(
      "bootstrap.servers" -> "ud-hadoopn-1:9092",
      "auto.offset.reset" -> "latest",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark-consumer")

    // Producer settings for the output topic
    val outputTopic = "mlOut"
    val kafkaParamsProd = Map(
      "bootstrap.servers" -> "ud-hadoopn-1:9092",
      "key.serializer" -> classOf[StringSerializer],
      "value.serializer" -> classOf[StringSerializer],
      "group.id" -> "spark-consumer")

    // Load the trained model once and broadcast it, together with the JSON
    // helper and the Kafka producer wrapper (JacksonWrapper and KafkaSink
    // are helper classes, not shown here)
    val modelC = CrossValidatorModel.load("/user/root/src/main/resources/model")
    val modelBC = ssc.sparkContext.broadcast(modelC)
    val jacksonWrapper = ssc.sparkContext.broadcast(JacksonWrapper())
    val kafkaSink = ssc.sparkContext.broadcast(KafkaSink(kafkaParamsProd))

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set(inputTopic), kafkaParams))

    val lines = stream.map(consumerRecord => consumerRecord.value().asInstanceOf[String])

    lines.foreachRDD(jsonRDD => {
      import spark.implicits._
      val dataDF = spark.read.json(jsonRDD)
      dataDF.createOrReplaceTempView("mytable")
      if (spark.sql("SELECT * FROM mytable").count() > 0) {
        println("************mytable********")
        spark.sql("SELECT * FROM mytable").show()
        val predictions = modelBC.value.transform(dataDF)
        println("************Prediction********")
        predictions.select("KEY", "prediction").show()
        val pDF1 = predictions.select("KEY", "prediction")
        // Cast both columns to long before mapping to Prediction objects
        val pDF = pDF1.columns.foldLeft(pDF1) {
          (newDF, colName) => newDF.withColumn(colName, pDF1(colName).cast("long"))
        }
        val x = pDF.rdd.map(row => Prediction(row.getLong(0), row.getLong(1)))
        x.foreach { msg =>
          val json = jacksonWrapper.value.send(msg)
          println("***json=" + json)
          kafkaSink.value.send("mlOut", json)
        }
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
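
One additional check that should show whether the direct stream receives anything at all per micro-batch would be to log the record count and offset ranges right after createDirectStream (a sketch only; HasOffsetRanges comes from the spark-streaming-kafka-0-10 integration):

// Per-batch diagnostic (not in the job above): print how many records each
// micro-batch received and which Kafka offsets it covered.
import org.apache.spark.streaming.kafka010.{ HasOffsetRanges, OffsetRange }

stream.foreachRDD { rdd =>
  val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  println(s"batch record count = ${rdd.count()}")
  ranges.foreach(r =>
    println(s"${r.topic} partition ${r.partition}: offsets ${r.fromOffset} -> ${r.untilOffset}"))
}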

0 Answers:

No answers yet