I am consuming Kafka messages in Avro format and building a DataFrame on top of them, but finalDF comes up empty. The bean itself gets populated correctly, and the values inside each function also look fine.
package com.virginvoyages.sparkstreaming

import scala.util.parsing.json.JSON

import org.apache.kafka.clients.CommonClientConfigs
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.config.SslConfigs
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.Level
import org.apache.log4j.LogManager
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Assign
import org.apache.spark.streaming.kafka010.HasOffsetRanges
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

import com.virginvoyages.sparkstreaming.StreamConstants
import com.virginvoyages.util.KafkaOffsetStreamingDriver

import io.confluent.kafka.serializers.KafkaAvroDeserializer

object FeedbackAvroMessageTest extends StreamConstants {

  /** Initialize logger */
  val log = LogManager.getRootLogger
  log.setLevel(Level.INFO)

  def main(args: Array[String]): Unit = {

    /** Initialize Spark session */
    def getSparkSession() = {
      val spark = SparkSession
        .builder()
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .getOrCreate()
      spark
    }

    val spark = getSparkSession()
    val streamingContext = new StreamingContext(
      spark.sparkContext,
      Seconds(spark.sparkContext.getConf.get("spark.personcreated.seconds").toInt))
    // streamingContext.checkpoint("." + System.currentTimeMillis())
    spark.sparkContext.getConf.getAll

    /** Kafka consumer, schema registry and SSL configuration */
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> spark.sparkContext.getConf.get("spark.sslkafkabrokers").trim,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[KafkaAvroDeserializer],
      "schema.registry.url" -> spark.sparkContext.getConf.get("spark.crmacxiom.schemaregistry"),
      // "specific.avro.reader" -> "true",
      ConsumerConfig.GROUP_ID_CONFIG -> spark.sparkContext.getConf.get("spark.crmacxiom.consumer").trim,
      // ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> spark.sparkContext.getConf.get("spark.crmacxiom.offset").trim(),
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean),
      CommonClientConfigs.SECURITY_PROTOCOL_CONFIG -> "SSL",
      SslConfigs.SSL_TRUSTSTORE_PASSWORD_CONFIG -> spark.sparkContext.getConf.get("spark.passvalue").trim,
      // JKS files are fetched from the current Spark job working dir
      SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG -> spark.sparkContext.getConf.get("spark.truststore").trim,
      SslConfigs.SSL_KEYSTORE_LOCATION_CONFIG -> spark.sparkContext.getConf.get("spark.keystore").trim,
      // SSL keystore/key passwords
      SslConfigs.SSL_KEYSTORE_PASSWORD_CONFIG -> spark.sparkContext.getConf.get("spark.passvalue").trim,
      SslConfigs.SSL_KEY_PASSWORD_CONFIG -> spark.sparkContext.getConf.get("spark.passvalue").trim)

    /* Read the last committed offsets from HBase */
    log.info("Reading offsets from HBase ****************")
    val fromOffsets = KafkaOffsetStreamingDriver.getLastCommittedOffset(
      spark.sparkContext.getConf.get("spark.feedback.kafka.topic").trim,
      spark.sqlContext,
      spark)
    log.info("Completed reading offsets from HBase ****************")

    /** Create a direct stream to read the Salesforce messages */
    val messages = KafkaUtils.createDirectStream[String, Object](
      streamingContext,
      PreferConsistent,
      Assign[String, Object](fromOffsets.keys, kafkaParams, fromOffsets))

    // broadcast the Spark configuration
    log.info("in Read :::::: ")
    val sparkConfiguration = spark.sparkContext.broadcast(spark.sparkContext.getConf.getAll.toMap)

    messages.foreachRDD { (rdd, batchTime) =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      log.info("Rdd Obj -----------> " + rdd)

      // parse every record of the batch into a FEEDBACK bean
      val msgTuple = createAvroMsgs(rdd)
      log.info("---------- RDD Count : ----------" + rdd.count())

      if (rdd.count() > 0) {
        val fd = msgTuple.filter(feedback => feedback != null).map { record =>
          val ACTIVITYID = record.ACTIVITYID
          val SAILORID = record.SAILORID
          val REQUESTSOURCE = record.REQUESTSOURCE
          val SAILORSELECTION = record.SAILORSELECTION
          val ROWKEY = record.ROWKEY
          val TIMESTAMP = record.TIMESTAMP
          (ACTIVITYID, SAILORID, REQUESTSOURCE, SAILORSELECTION, ROWKEY, TIMESTAMP)
        }

        import spark.sqlContext.implicits._
        val finalDF = fd.toDF("ACTIVITYID", "SAILORID", "REQUESTSOURCE", "SAILORSELECTION", "ROWKEY", "TIMESTAMP")
        finalDF.show() // this DataFrame comes up empty here; need a suggestion why
      }
    }

    streamingContext.start()
    streamingContext.awaitTermination()
  } // streaming job ends here

  /**
   * Iterate over the RDD and return an RDD of FEEDBACK beans built from the JSON messages.
   */
  def createAvroMsgs(msgRdd: RDD[ConsumerRecord[String, Object]]) = {
    val tupleDF = msgRdd.map { record =>
      val avroMsg = record.value
      val dfinal = parseRequestSource(avroMsg)
      dfinal
    }
    // val tuple = tupleDF.collect();
    tupleDF
  }

  // fill the FEEDBACK bean by iterating over the sailorFeedback entries
  def parseSailorFeedback(messages: Object, requestSource: String): FEEDBACK = {
    var feedbackfullList: List[(String, String)] = List()
    var feedbackDetails: FEEDBACK = null
    var sailorid: String = "NA"
    var activityid: String = "NA"

    import org.apache.spark.sql._
    import java.time.Instant
    var unixTimestamp: Long = Instant.now.getEpochSecond

    val dataRefreshReqMsg = JSON.parseFull(messages.toString())
    dataRefreshReqMsg match {
      case Some(rootMap: Map[String, Map[String, String]]) =>
        rootMap.get("pl") match {
          case Some(feedbackMap: Map[String, List[Map[String, String]]]) =>
            feedbackMap.get("sailorFeedback") match {
              case Some(feedback: List[Map[String, String]]) =>
                feedback.foreach { feedbackList: Map[String, String] =>
                  // feedbackfullList = feedbackfullList :+ (feedbackList.getOrElse("sailorId", "NA"), feedbackList.getOrElse("activityId", "NA"))
                  val feedbackDetails = FEEDBACK(
                    feedbackList.get("sailorId").get,
                    feedbackList.get("activityId").get,
                    requestSource,
                    feedbackList.get("sailorSelection").get,
                    unixTimestamp.toString(),
                    "")
                }
                log.info("feedbackfullList : " + feedbackfullList)
            }
        }
    }
    return feedbackDetails
  }

  // extract requestSource from the payload, then delegate to parseSailorFeedback
  def parseRequestSource(messages: Object): FEEDBACK = {
    var feedbackDetails: FEEDBACK = null
    val dataRefreshReqMsg = JSON.parseFull(messages.toString())
    var requestSource: String = null

    dataRefreshReqMsg match {
      case Some(rootMap: Map[String, Map[String, String]]) =>
        rootMap.get("pl") match {
          case Some(evtMap: Map[String, String]) => {
            log.info("requestSource --> " + evtMap.get("requestSource").get)
            requestSource = evtMap.get("requestSource").get
            val feedbackDetails = parseSailorFeedback(messages, requestSource)
          }
          case None => {
            log.info("Null in requestSource: " + dataRefreshReqMsg)
          }
        }
    }
    return feedbackDetails
  }
}
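The FEEDBACK bean is defined elsewhere in the project and is not shown above; a rough sketch, inferred only from the constructor calls and field accesses in the code (so the real field names and order may differ), would be:

// Hypothetical sketch of the bean, inferred from the positional FEEDBACK(...) call
// and the record.* accesses above; the actual definition is not part of this class.
case class FEEDBACK(
  SAILORID: String,
  ACTIVITYID: String,
  REQUESTSOURCE: String,
  SAILORSELECTION: String,
  TIMESTAMP: String,
  ROWKEY: String)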
finalDF comes up empty, and I am not sure where I am going wrong. This is what finalDF.show() prints:
+----------+--------+-------------+---------------+------+---------+
|ACTIVITYID|SAILORID|REQUESTSOURCE|SAILORSELECTION|ROWKEY|TIMESTAMP|
+----------+--------+-------------+---------------+------+---------+
+----------+--------+-------------+---------------+------+---------+
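For comparison, here is a minimal standalone sketch of the same tuple-to-DataFrame step, run outside the streaming job with a local SparkSession and one hard-coded sample tuple (the object name and sample values are illustrative only):

import org.apache.spark.sql.SparkSession

object ToDFSketch {
  def main(args: Array[String]): Unit = {
    // local session used only for this isolated test
    val spark = SparkSession.builder().master("local[*]").appName("toDF-sketch").getOrCreate()
    import spark.implicits._

    // one hard-coded tuple in the same shape the streaming job builds
    val fd = spark.sparkContext.parallelize(Seq(
      ("act-1", "sailor-1", "WEB", "LIKE", "rowkey-1", "1589000000")))

    val finalDF = fd.toDF("ACTIVITYID", "SAILORID", "REQUESTSOURCE", "SAILORSELECTION", "ROWKEY", "TIMESTAMP")
    finalDF.show() // prints one populated row

    spark.stop()
  }
}

If this isolated version shows a row while the streaming version stays empty, the emptiness presumably originates in the RDD returned by createAvroMsgs rather than in the toDF call itself.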