I am using KafkaUtils.createDirectStream to fetch messages from a Kafka topic in Spark Streaming. I submit my Spark job with spark2-submit (Cloudera distribution 5.10). The problem is that the Spark Streaming job does not fetch any messages from the Kafka topic. The logs show nothing and there is no error message.
The only related output I can see in the log is the following:
INFO utils.AppInfoParser: Kafka version : 0.10.0.0
INFO utils.AppInfoParser: Kafka commitId : b8642491e78c5a13
I am using kafka-console-producer to publish messages to the topic. When I fetch the messages with kafka-console-consumer, they are retrieved successfully.
Following is my configuration:
val kafkaParams = Map(
  "bootstrap.servers" -> "ud-hadoopn-1:9092",
  "auto.offset.reset" -> "latest",
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "spark-consumer")
Please find the complete Spark Streaming code below:
package com.model
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.CrossValidatorModel
import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
import org.apache.spark.streaming.{ StreamingContext, Seconds }
import org.apache.spark.ml.{ Pipeline, PipelineStage, PipelineModel }
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.linalg.Vector
import java.io.StringWriter
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.{ DeserializationFeature, SerializationFeature }
import com.fasterxml.jackson.annotation.JsonProperty
import com.fasterxml.jackson.core.JsonParseException
import org.apache.spark.sql.functions.udf
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.common.serialization.{ StringSerializer, StringDeserializer }
import java.util.Properties
import org.apache.spark.streaming.kafka010.{ KafkaUtils, LocationStrategies, ConsumerStrategies }
import scala.util.parsing.json.JSONObject
object StreamingApp {
  def main(args: Array[String]) {
    case class Prediction(
      fabricKey: Long,
      prediction: Long)

    val conf = new SparkConf().setAppName("StreamingApp").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val errorRecords = sc.accumulator(0)
    val spark = SparkSession.builder().master("local[*]")
      .appName("StreamingApp").getOrCreate()
    val ssc = new StreamingContext(sc, Seconds(1))
    import sqlContext._
    import sqlContext.implicits._

    val inputTopic = "mlIn"
    val kafkaParams = Map(
      "bootstrap.servers" -> "ud-hadoopn-1:9092",
      "auto.offset.reset" -> "latest",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark-consumer")

    val outputTopic = "mlOut"
    val kafkaParamsProd = Map(
      "bootstrap.servers" -> "ud-hadoopn-1:9092",
      "key.serializer" -> classOf[StringSerializer],
      "value.serializer" -> classOf[StringSerializer],
      "group.id" -> "spark-consumer")

    val modelC = CrossValidatorModel.load("/user/root/src/main/resources/model")
    val modelBC = ssc.sparkContext.broadcast(modelC)
    val jacksonWrapper = ssc.sparkContext.broadcast(JacksonWrapper())
    val kafkaSink = ssc.sparkContext.broadcast(KafkaSink(kafkaParamsProd))

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set(inputTopic), kafkaParams))

    val lines = stream.map(consumerRecord => consumerRecord.value().asInstanceOf[String])

    lines.foreachRDD { jsonRDD =>
      import spark.implicits._
      val dataDF = spark.read.json(jsonRDD)
      dataDF.createOrReplaceTempView("mytable")
      if (spark.sql("SELECT * FROM mytable").count() > 0) {
        println("************mytable********")
        spark.sql("SELECT * FROM mytable").show()
        val predictions = modelBC.value.transform(dataDF)
        println("************Prediction********")
        predictions.select("KEY", "prediction").show()
        val pDF1 = predictions.select("KEY", "prediction")
        val pDF = pDF1.columns.foldLeft(pDF1) {
          (newDF, colName) => newDF.withColumn(colName, pDF1(colName).cast("long"))
        }
        val x = pDF.rdd.map(row => Prediction(row.getLong(0), row.getLong(1)))
        x.foreach { msg =>
          val json = jacksonWrapper.value.send(msg)
          println("***json=" + json)
          kafkaSink.value.send("mlOut", json)
        }
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
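KafkaSink and JacksonWrapper are small helper classes that are not included above. For completeness, the following is only a rough sketch of the usual shape of such a broadcastable sink, not the exact implementation in my project: a serializable wrapper that builds the KafkaProducer lazily from a factory function, so only the factory (not the non-serializable producer) is shipped to the executors and each executor creates its own producer.

import java.util.Properties
import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerRecord }

// Sketch of a broadcastable Kafka sink; the producer is created lazily on first use.
class KafkaSink(createProducer: () => KafkaProducer[String, String]) extends Serializable {
  lazy val producer = createProducer()
  def send(topic: String, value: String): Unit =
    producer.send(new ProducerRecord[String, String](topic, value))
}

object KafkaSink {
  def apply(config: Map[String, Object]): KafkaSink = {
    val createProducer = () => {
      val props = new Properties()
      config.foreach { case (k, v) => props.put(k, v) }
      val producer = new KafkaProducer[String, String](props)
      sys.addShutdownHook { producer.close() }
      producer
    }
    new KafkaSink(createProducer)
  }
}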