Converting Avro messages to a DataFrame in Scala

Date: 2019-05-03 19:47:10

Tags: scala apache-spark apache-kafka spark-streaming

I am consuming Kafka messages in Avro format and building a DataFrame on top of them, but finalDF is coming up empty. The bean even gets populated correctly, and the values inside each function also look fine.
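For reference, this is the simplified flow I am aiming for inside foreachRDD (just a sketch that reuses my FEEDBACK bean's fields and the parsing method from the full job below; the real Kafka/SSL configuration is in the complete code):

// simplified sketch, not the full job: each record value arrives already
// Avro-decoded by KafkaAvroDeserializer, is parsed into a FEEDBACK bean,
// and the non-null beans are turned into a DataFrame
val beans = rdd.map(record => parseRequestSource(record.value))
val rows = beans.filter(_ != null).map(f =>
    (f.ACTIVITYID, f.SAILORID, f.REQUESTSOURCE, f.SAILORSELECTION, f.ROWKEY, f.TIMESTAMP))
import spark.sqlContext.implicits._
val finalDF = rows.toDF("ACTIVITYID", "SAILORID", "REQUESTSOURCE", "SAILORSELECTION", "ROWKEY", "TIMESTAMP")
finalDF.show()

The full job is below.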

package com.virginvoyages.sparkstreaming
import scala.util.parsing.json.JSON
import org.apache.kafka.clients.CommonClientConfigs
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.config.SslConfigs
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.Level
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Assign
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.HasOffsetRanges
import com.virginvoyages.util.KafkaOffsetStreamingDriver
import com.virginvoyages.sparkstreaming.StreamConstants
import io.confluent.kafka.serializers.KafkaAvroDeserializer
import org.apache.spark.SparkContext
object FeedbackAvroMessageTest extends StreamConstants {
    /**
    * Initialize logger
    */
    val log = LogManager.getRootLogger
    log.setLevel(Level.INFO)
    def main(args: Array[String]): Unit = {
        /**
        * Initialize spark context
        */
        // start spark session
        def getSparkSession() =
        {
            val spark = SparkSession
            .builder()
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
            .getOrCreate()
            spark
        }
        val spark = getSparkSession()
        val streamingContext = new StreamingContext(spark.sparkContext, Seconds(spark.sparkContext.getConf.get("spark.personcreated.seconds").toInt))
        //streamingContext.checkpoint("." + System.currentTimeMillis())
        spark.sparkContext.getConf.getAll
        /**
        * Kafka consumer configuration
        */
        val kafkaParams = Map[String, Object](
            ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> spark.sparkContext.getConf.get("spark.sslkafkabrokers").trim,
            ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
            ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[KafkaAvroDeserializer],
            "schema.registry.url" -> spark.sparkContext.getConf.get("spark.crmacxiom.schemaregistry"),
            // "specific.avro.reader" -> "true",
            ConsumerConfig.GROUP_ID_CONFIG -> spark.sparkContext.getConf.get("spark.crmacxiom.consumer").trim,
            // ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> spark.sparkContext.getConf.get("spark.crmacxiom.offset").trim(),
            ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean),
            CommonClientConfigs.SECURITY_PROTOCOL_CONFIG -> "SSL",
            SslConfigs.SSL_TRUSTSTORE_PASSWORD_CONFIG -> spark.sparkContext.getConf.get("spark.passvalue").trim,
            /**
            * Fetch JKS files from the current Spark job working directory
            */
            // filling in all config details for Kafka and schema registry
            SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG -> spark.sparkContext.getConf.get("spark.truststore").trim,
            SslConfigs.SSL_KEYSTORE_LOCATION_CONFIG -> spark.sparkContext.getConf.get("spark.keystore").trim,
            // setting SSL details
            SslConfigs.SSL_KEYSTORE_PASSWORD_CONFIG -> spark.sparkContext.getConf.get("spark.passvalue").trim,
            SslConfigs.SSL_KEY_PASSWORD_CONFIG -> spark.sparkContext.getConf.get("spark.passvalue").trim)
        /*
        * Reading offsets from HBase
        */
        // offsets are read from HBase
        log.info("Reading From offset from Hbase****************")
        val fromOffsets = KafkaOffsetStreamingDriver.getLastCommittedOffset(spark.sparkContext.getConf.get("spark.feedback.kafka.topic").trim, spark.sqlContext, spark)
        log.info("Completed Reading offset from Hbase****************")
        /**
        * Create a Direct Stream to read the Salesforce Messages
        */
        //creating direct stream
        val messages = KafkaUtils.createDirectStream[String, Object](streamingContext, PreferConsistent, Assign[String, Object](fromOffsets.keys, kafkaParams, fromOffsets))
        //broadcast details
        log.info("in Read :::::: ");
        val sparkConfiguration = spark.sparkContext.broadcast(spark.sparkContext.getConf.getAll.toMap)
        messages.foreachRDD {
            (rdd, batchTime) =>
            val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
            log.info("Rdd Obj -----------> " + rdd);
            //passing rdd to method
            val msgTuple = createAvroMsgs(rdd);
            log.info("----------RDD Count :----------" + rdd.count())
            if (rdd.count() > 0) {
                val fd=msgTuple.filter(feedback => feedback!=null).map(
                    record => {
                        val ACTIVITYID = record.ACTIVITYID
                        val SAILORID = record.SAILORID
                        val REQUESTSOURCE = record.REQUESTSOURCE
                        val SAILORSELECTION = record.SAILORSELECTION
                        val ROWKEY = record.ROWKEY
                        val TIMESTAMP = record.TIMESTAMP
                        (ACTIVITYID,SAILORID,REQUESTSOURCE,SAILORSELECTION,ROWKEY,TIMESTAMP)
                    })
                import spark.sqlContext.implicits._
                val finalDF= fd.toDF("ACTIVITYID","SAILORID","REQUESTSOURCE","SAILORSELECTION","ROWKEY","TIMESTAMP")
                finalDF.show() // this DataFrame is coming up empty here ??
                // need a suggestion as to why it is empty
            }
        }
        streamingContext.start()
        streamingContext.awaitTermination()
    }
    // streaming stops here
    /**
    * Iterate over the RDD and map each Avro message to a FEEDBACK bean
    */
    // takes the main RDD and, for each record, fills a bean from the
    // parsed message, returning the resulting RDD of bean objects
    def createAvroMsgs(msgRdd: RDD[ConsumerRecord[String, Object]]) = {
        val tupleDF = msgRdd.map {
            record =>
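            // record.value is already Avro-decoded by KafkaAvroDeserializer;
            // its toString() yields JSON, which is parsed further downstream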
            val avroMsg = record.value
            val dfinal=parseRequestSource(avroMsg)
            (dfinal)
        }
        // val tuple = tupleDF.collect();
        tupleDF
    }

    // fills the bean with all details by iterating over the parsed values
    def parseSailorFeedback(messages: Object, requestSource: String) : FEEDBACK = {
        var feedbackfullList: List[(String, String)] = List()
        var feedbackDetails:FEEDBACK = null;
        var sailorid: String = "NA"
        var activityid: String = "NA"
        import org.apache.spark.sql._
        import java.time.Instant
        var unixTimestamp: Long = Instant.now.getEpochSecond
        val dataRefreshReqMsg = JSON.parseFull(messages.toString())
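        // expected JSON shape (inferred from the pattern matches below):
        // { "pl": { "requestSource": "...",
        //           "sailorFeedback": [ { "sailorId": "...", "activityId": "...", "sailorSelection": "..." }, ... ] } }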
        dataRefreshReqMsg match {
            case Some(rootMap: Map[String, Map[String, String]]) => rootMap.get("pl") match {
                case Some(feedbackMap: Map[String, List[Map[String, String]]]) => feedbackMap.get("sailorFeedback") match {
                    case Some(feedback: List[Map[String, String]]) =>
                    feedback.foreach {
                        feedbackList: Map[String, String] =>
                        //feedbackfullList = feedbackfullList :+ (feedbackList.getOrElse("sailorId", "NA"), feedbackList.getOrElse("activityId", "NA"))
                        val feedbackDetails= FEEDBACK(feedbackList.get("sailorId").get,feedbackList.get("activityId").get,requestSource,feedbackList.get("sailorSelection").get, unixTimestamp.toString(),"")
                    }
                    log.info("feedbackfullList : " + feedbackfullList)
                }
            }
        }
        return feedbackDetails
    }

    // fills the bean from the parsed request source
    def parseRequestSource(messages: Object): FEEDBACK = {
        var feedbackDetails:FEEDBACK = null;
        val dataRefreshReqMsg = JSON.parseFull(messages.toString())
        var requestSource: String = null
        dataRefreshReqMsg match {
            case Some(rootMap: Map[String, Map[String, String]]) => rootMap.get("pl") match {
                case Some(evtMap: Map[String, String]) => {
                    log.info("requestSource --> " + evtMap.get("requestSource").get);
                    requestSource = evtMap.get("requestSource").get;
                    val feedbackDetails =  parseSailorFeedback(messages, requestSource)
                }
                case None => {
                    log.info("Null in requestSource:  " + dataRefreshReqMsg);
                }
            }
        }
        return feedbackDetails
    }
}

finalDF is coming up empty, and I am not sure where I am making a mistake. The output of finalDF.show() is:

+----------+--------+-------------+---------------+------+---------+  
|ACTIVITYID|SAILORID|REQUESTSOURCE|SAILORSELECTION|ROWKEY|TIMESTAMP|
+----------+--------+-------------+---------------+------+---------+
+----------+--------+-------------+---------------+------+---------+
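As a quick sanity check (a sketch only, meant to run inside the foreachRDD block above, reusing the same rdd and createAvroMsgs), I could count how many parsed beans survive the null filter; if parseRequestSource always returned null, the filter would drop every record and the DataFrame would come out empty like this:

// sanity-check sketch, run inside messages.foreachRDD:
val parsed = createAvroMsgs(rdd)                    // RDD[FEEDBACK], may contain nulls
val total = parsed.count()                          // messages in this batch
val nonNull = parsed.filter(_ != null).count()      // beans that survive the null filter
log.info("batch total=" + total + ", non-null beans=" + nonNull)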

0 Answers:

No answers yet.