Joining two Spark DStreams with complex nested structures

Time: 2018-11-23 15:49:12

Tags: apache-spark apache-spark-sql spark-streaming

I have implemented a Spark custom receiver to receive a DStream from an http/REST endpoint, as follows:

val mem1Total:ReceiverInputDStream[String] = ssc.receiverStream(new CustomReceiver("httpURL1"))
val dstreamMem1:DStream[String] = mem1Total.window(Durations.seconds(30), Durations.seconds(10)) 
val mem2Total:ReceiverInputDStream[String] = ssc.receiverStream(new CustomReceiver("httpURL2"))
val dstreamMem2:DStream[String] = mem2Total.window(Durations.seconds(30), Durations.seconds(10)) 
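
The CustomReceiver itself is not shown here; for context, a minimal sketch of the kind of receiver it is (the body below is an illustrative sketch of a line-oriented HTTP receiver, not the exact implementation):

    import java.io.{BufferedReader, InputStreamReader}
    import java.net.{HttpURLConnection, URL}
    import org.apache.spark.storage.StorageLevel
    import org.apache.spark.streaming.receiver.Receiver

    // Illustrative sketch only: a receiver that reads newline-delimited
    // responses from an HTTP endpoint and stores each line as a record.
    class CustomReceiver(httpURL: String)
      extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

      override def onStart(): Unit = {
        // Read on a separate thread so onStart() returns immediately.
        new Thread("HTTP Receiver") {
          override def run(): Unit = receive()
        }.start()
      }

      override def onStop(): Unit = ()   // the reading loop checks isStopped()

      private def receive(): Unit = {
        try {
          val conn = new URL(httpURL).openConnection().asInstanceOf[HttpURLConnection]
          val reader = new BufferedReader(
            new InputStreamReader(conn.getInputStream, java.nio.charset.StandardCharsets.UTF_8))
          var line = reader.readLine()
          while (!isStopped && line != null) {
            store(line)                  // hand the record to Spark
            line = reader.readLine()
          }
          reader.close()
          restart("Connection closed, restarting")
        } catch {
          case e: Exception => restart("Error receiving data", e)
        }
      }
    }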

Each stream has the following schema:

val schema = StructType(Seq(
  StructField("status", StringType),
  StructField("data", StructType(Seq(
    StructField("resultType", StringType),
    StructField("result", ArrayType(StructType(Array(
      StructField("metric", StructType(Seq(
        StructField("application", StringType),
        StructField("component", StringType),
        StructField("instance", StringType)))),
      StructField("value", ArrayType(StringType))
    ))))
  )))
))
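
For reference, the payloads coming back from the endpoints have roughly this shape (a hypothetical example; the values are made up):

    // Hypothetical example payload matching the schema above (values invented).
    val samplePayload = """{
      "status": "success",
      "data": {
        "resultType": "vector",
        "result": [
          {
            "metric": { "application": "app1", "component": "web", "instance": "host1:9100" },
            "value": ["1543493352.000", "123456"]
          }
        ]
      }
    }"""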

Here is how far I have been able to get in extracting the fields I need from dstreamMem1:

dstreamMem1.foreachRDD { rdd =>
  import sparkSession.implicits._
  val df = rdd.toDS()
    .selectExpr("cast (value as string) as myData")
    .select(from_json($"myData", schema).as("myDataEvent"))
    .select($"myDataEvent.data.*")
    .select(explode($"result").as("flat"))
    .select($"flat.metric.*",
            $"flat.value".getItem(0).as("value1"),
            $"flat.value".getItem(1).as("value2"))
}
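
This gives me a flat frame per window with the columns application, component, instance, value1 and value2.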

But I cannot figure out how to join dstreamMem1 with dstreamMem2 while also dealing with this complex structure. I could do a union of dstreamMem1 and dstreamMem2, but that does not work in my case because the "value" field means something different on each stream. Any ideas?
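
To make the intent concrete, what I am after is roughly the following (a sketch only; it assumes both streams parse with the schema above and can be keyed on the metric columns, and the flatten helper here is hypothetical):

    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.functions.{explode, from_json}
    import sparkSession.implicits._

    // Hypothetical helper: parse one raw stream with the schema above and keep
    // the metric identity plus a single, explicitly named value column.
    def flatten(raw: RDD[String], valueName: String): DataFrame =
      raw.toDF("value")
        .select(from_json($"value", schema).as("e"))
        .select(explode($"e.data.result").as("flat"))
        .select($"flat.metric.*", $"flat.value".getItem(1).as(valueName))

    // Join the two windowed streams on the metric identity, so each output row
    // carries both measurements side by side instead of being unioned.
    val joined = dstreamMem1.transformWith(dstreamMem2,
      (rdd1: RDD[String], rdd2: RDD[String]) =>
        flatten(rdd1, "mem1Value")
          .join(flatten(rdd2, "mem2Value"), Seq("application", "component", "instance"))
          .rdd
    )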

Edit #1: Based on the following resources:

How to create a custom streaming data source?
https://github.com/apache/spark/pull/21145
https://github.com/hienluu/structured-streaming-sources/tree/master/streaming-sources/src/main/scala/org/structured_streaming_sources/twitter

I have been able to put together the following class:

        class SSPStreamMicroBatchReader(options: DataSourceOptions) extends MicroBatchReader with Logging {

          private val httpURL = options.get(SSPStreamingSource.HTTP_URL).orElse("") //.toString()
          private val numPartitions = options.get(SSPStreamingSource.NUM_PARTITIONS).orElse("5").toInt
          private val queueSize = options.get(SSPStreamingSource.QUEUE_SIZE).orElse("512").toInt
          private val debugLevel = options.get(SSPStreamingSource.DEBUG_LEVEL).orElse("debug").toLowerCase

          private val NO_DATA_OFFSET = SSPOffset(-1)
          private var startOffset: SSPOffset = new SSPOffset(-1)
          private var endOffset: SSPOffset = new SSPOffset(-1)
          private var currentOffset: SSPOffset = new SSPOffset(-1)
          private var lastReturnedOffset: SSPOffset = new SSPOffset(-2)
          private var lastOffsetCommitted : SSPOffset = new SSPOffset(-1)

          private var incomingEventCounter = 0;
          private var stopped:Boolean = false

          private var acsURLConn:HttpURLConnection = null
          private var worker:Thread = null

          private val sspList:ListBuffer[StreamingQueryStatus] = new ListBuffer[StreamingQueryStatus]()
          private var sspQueue:BlockingQueue[StreamingQueryStatus] = null

          initialize()

          private def initialize(): Unit = synchronized {
            log.warn(s"Inside initialize ....")
            sspQueue = new ArrayBlockingQueue(queueSize)

            new Thread("Socket Receiver") { log.warn(s"Inside thread ....") 
              override def run() { 
                log.warn(s"Inside run ....") 
                receive() }
            }.start()
          }

          private def receive(): Unit = {
            log.warn(s"Inside recieve() ....")
            var userInput: String = null

              acsURLConn = new AccessACS(httpURL).getACSConnection(); 

              // Until stopped or connection broken continue reading
              val reader = new BufferedReader(
                new InputStreamReader(acsURLConn.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))

              userInput = reader.readLine()

              while(!stopped) {
                  // poll tweets from queue
                  val tweet:StreamingQueryStatus = sspQueue.poll(100, TimeUnit.MILLISECONDS)

                  if (tweet != null) {  
                    sspList.append(tweet);
                    currentOffset = currentOffset + 1  
                    incomingEventCounter = incomingEventCounter + 1;
                  }
                }
              reader.close()

          }


          override def planInputPartitions(): java.util.List[InputPartition[org.apache.spark.sql.catalyst.InternalRow]] = {
            synchronized {
              log.warn(s"Inside planInputPartitions ....")

              //initialize()

              val startOrdinal = startOffset.offset.toInt + 1
              val endOrdinal = endOffset.offset.toInt + 1

              internalLog(s"createDataReaderFactories: sOrd: $startOrdinal, eOrd: $endOrdinal, " +
                s"lastOffsetCommitted: $lastOffsetCommitted")

              val newBlocks = synchronized {
                val sliceStart = startOrdinal - lastOffsetCommitted.offset.toInt - 1
                val sliceEnd = endOrdinal - lastOffsetCommitted.offset.toInt - 1
                assert(sliceStart <= sliceEnd, s"sliceStart: $sliceStart sliceEnd: $sliceEnd")
                sspList.slice(sliceStart, sliceEnd)
              }

              newBlocks.grouped(numPartitions).map { block =>
                new SSPStreamBatchTask(block).asInstanceOf[InputPartition[org.apache.spark.sql.catalyst.InternalRow]]
              }.toList.asJava
            }
          }

          override def setOffsetRange(start: Optional[Offset],
                                      end: Optional[Offset]): Unit = {

            if (start.isPresent && start.get().asInstanceOf[SSPOffset].offset != currentOffset.offset) {
              internalLog(s"setOffsetRange: start: $start, end: $end currentOffset: $currentOffset")
            }

            this.startOffset = start.orElse(NO_DATA_OFFSET).asInstanceOf[SSPOffset]
            this.endOffset = end.orElse(currentOffset).asInstanceOf[SSPOffset]
          }

          override def getStartOffset(): Offset = {
            internalLog("getStartOffset was called")
            if (startOffset.offset == -1) {
              throw new IllegalStateException("startOffset is -1")
            }
            startOffset
          }

          override def getEndOffset(): Offset = {
            if (endOffset.offset == -1) {
              currentOffset
            } else {

              if (lastReturnedOffset.offset < endOffset.offset) {
                internalLog(s"** getEndOffset => $endOffset)")
                lastReturnedOffset = endOffset
              }

              endOffset
            }

          }

          override def commit(end: Offset): Unit = {
            internalLog(s"** commit($end) lastOffsetCommitted: $lastOffsetCommitted")

            val newOffset = SSPOffset.convert(end).getOrElse(
              sys.error(s"SSPStreamMicroBatchReader.commit() received an offset ($end) that did not " +
                s"originate with an instance of this class")
            )

            val offsetDiff = (newOffset.offset - lastOffsetCommitted.offset).toInt

            if (offsetDiff < 0) {
              sys.error(s"Offsets committed out of order: $lastOffsetCommitted followed by $end")
            }

            sspList.trimStart(offsetDiff)
            lastOffsetCommitted = newOffset
          }

          override def stop(): Unit = {
            log.warn(s"There is a total of $incomingEventCounter events that came in")
            stopped = true
            if (acsURLConn != null) {
              try {
                //acsURLConn.disconnect()
              } catch {
                case e: IOException =>
              }
            }
          }

          override def deserializeOffset(json: String): Offset = {
            SSPOffset(json.toLong)
          }
          override def readSchema(): StructType = {
            SSPStreamingSource.SCHEMA
          }

          private def internalLog(msg:String): Unit = {
            debugLevel match {
              case "warn" => log.warn(msg)
              case "info" => log.info(msg)
              case "debug" => log.debug(msg)
              case _ =>
            }
          }
        }

        object SSPStreamingSource {

          val HTTP_URL = "httpURL"
          val DEBUG_LEVEL = "debugLevel"
          val NUM_PARTITIONS = "numPartitions"
          val QUEUE_SIZE = "queueSize"

           val SCHEMA = StructType(Seq(
             StructField("status", StringType),
             StructField("data", StructType(Seq(
               StructField("resultType", StringType),
               StructField("result", ArrayType(StructType(Array(
                 StructField("metric", StructType(Seq(
                   StructField("application", StringType),
                   StructField("component", StringType),
                   StructField("instance", StringType)))),
                 StructField("value", ArrayType(StringType))
               ))))
             )))
           ))
        }

        class SSPStreamBatchTask(sspList:ListBuffer[StreamingQueryStatus]) extends InputPartition[Row] {
          override def createPartitionReader(): InputPartitionReader[Row] = new SSPStreamBatchReader(sspList)
        }

        class SSPStreamBatchReader(sspList:ListBuffer[StreamingQueryStatus]) extends InputPartitionReader[Row] {
          private var currentIdx = -1

          override def next(): Boolean = {
            // Return true as long as the new index is in the seq.
            currentIdx += 1
            currentIdx < sspList.size
          }

          override def get(): Row = {
            val tweet = sspList(currentIdx)
            Row(tweet.json)
          }

          override def close(): Unit = {}
        }
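
The SSPOffset type used throughout the reader is not shown above; it follows the offset pattern from the linked structured-streaming-sources example and looks roughly like this (a sketch, with the members inferred from how it is called here, assuming the Spark 2.3/2.4 DataSourceV2 streaming Offset):

        import org.apache.spark.sql.sources.v2.reader.streaming.Offset

        // Sketch of the offset type referenced above, inferred from how it is
        // used (SSPOffset(-1), currentOffset + 1, SSPOffset.convert(end), json).
        case class SSPOffset(offset: Long) extends Offset {
          override def json(): String = offset.toString

          // Advance the offset by a number of newly received events.
          def +(increment: Long): SSPOffset = SSPOffset(offset + increment)
        }

        object SSPOffset {
          // Recover an SSPOffset from a generic Offset, if possible.
          def convert(offset: Offset): Option[SSPOffset] = offset match {
            case o: SSPOffset => Some(o)
            case other        => scala.util.Try(SSPOffset(other.json().toLong)).toOption
          }
        }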

The class is used as follows:

    val a = sparkSession.readStream
            .format(providerClassName)
            .option(SSPStreamingSource.HTTP_URL, httpMemTotal)
            .load()
    a.printSchema()

    a.writeStream
            .outputMode(OutputMode.Append())
            .option("checkpointLocation", "/home/localCheckpoint1") //local
            .start("/home/sparkoutput/aa00a01")   

Here is the error. I have not cracked it yet :(

    18/11/29 13:33:28 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
    18/11/29 13:33:28 WARN SSPStreamingSource: Inside createMicroBatchReader() ....
    18/11/29 13:33:28 WARN SSPStreamMicroBatchReader: Inside initialize ....
    18/11/29 13:33:28 WARN SSPStreamMicroBatchReader: Inside thread ....
    18/11/29 13:33:28 WARN SSPStreamMicroBatchReader: There is a total of 0 events that came in
    18/11/29 13:33:28 WARN SSPStreamMicroBatchReader: Inside run ....
    18/11/29 13:33:28 WARN SSPStreamMicroBatchReader: Inside recieve() ....
    root
     |-- status: string (nullable = true)
     |-- data: struct (nullable = true)
     |    |-- resultType: string (nullable = true)
     |    |-- result: array (nullable = true)
     |    |    |-- element: struct (containsNull = true)
     |    |    |    |-- metric: struct (nullable = true)
     |    |    |    |    |-- application: string (nullable = true)
     |    |    |    |    |-- component: string (nullable = true)
     |    |    |    |    |-- instance: string (nullable = true)
     |    |    |    |-- value: array (nullable = true)
     |    |    |    |    |-- element: string (containsNull = true)

    18/11/29 13:33:30 INFO MicroBatchExecution: Starting [id = f15252df-96d8-45b4-a6db-83fd4c7aed71, runId = 65a6dc28-5eb4-468a-80c3-f547504689d7]. Use file:///home/localCheckpoint1 to store the query checkpoint.
    18/11/29 13:33:30 WARN SSPStreamingSource: Inside createMicroBatchReader() ....
    18/11/29 13:33:30 WARN SSPStreamMicroBatchReader: Inside initialize ....
    18/11/29 13:33:30 ERROR StreamingContext: Error starting the context, marking it as stopped
    java.lang.IllegalArgumentException: requirement failed: No output operations registered, so nothing to execute
      at scala.Predef$.require(Predef.scala:224)
      at org.apache.spark.streaming.DStreamGraph.validate(DStreamGraph.scala:168)
      at org.apache.spark.streaming.StreamingContext.validate(StreamingContext.scala:513)
      at org.apache.spark.streaming.StreamingContext.liftedTree1$1(StreamingContext.scala:573)
      at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:572)
      at myproject.spark.predictive_monitoring.predictmyproject$.run(predictmyproject.scala:99)
      at myproject.spark.predictive_monitoring.predictmyproject$.main(predictmyproject.scala:31)
      at myproject.spark.predictive_monitoring.predictmyproject.main(predictmyproject.scala)
    Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: No output operations registered, so nothing to execute
      at scala.Predef$.require(Predef.scala:224)
      at org.apache.spark.streaming.DStreamGraph.validate(DStreamGraph.scala:168)
      at org.apache.spark.streaming.StreamingContext.validate(StreamingContext.scala:513)
      at org.apache.spark.streaming.StreamingContext.liftedTree1$1(StreamingContext.scala:573)
      at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:572)
      at myproject.spark.predictive_monitoring.predictmyproject$.run(predictmyproject.scala:99)
      at myproject.spark.predictive_monitoring.predictmyproject$.main(predictmyproject.scala:31)
      at myproject.spark.predictive_monitoring.predictmyproject.main(predictmyproject.scala)
    18/11/29 13:33:30 INFO SparkContext: Invoking stop() from shutdown hook
    18/11/29 13:33:30 WARN SSPStreamMicroBatchReader: Inside thread ....
    18/11/29 13:33:30 WARN SSPStreamMicroBatchReader: Inside run ....
    18/11/29 13:33:30 WARN SSPStreamMicroBatchReader: Inside recieve() ....
    18/11/29 13:33:30 INFO MicroBatchExecution: Using MicroBatchReader [myproject.spark.predictive_monitoring.SSPStreamMicroBatchReader@74cc1ddc] from DataSourceV2 named 'myproject.spark.predictive_monitoring.SSPStreamingSource' [myproject.spark.predictive_monitoring.SSPStreamingSource@7e503c3]
    18/11/29 13:33:30 INFO SparkUI: Stopped Spark web UI at http://172.16.221.232:4040
    18/11/29 13:33:30 ERROR MicroBatchExecution: Query [id = f15252df-96d8-45b4-a6db-83fd4c7aed71, runId = 65a6dc28-5eb4-468a-80c3-f547504689d7] terminated with error
    java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
    This stopped SparkContext was created at:

    org.apache.spark.SparkContext.<init>(SparkContext.scala:76)
    org.apache.spark.streaming.StreamingContext$.createNewSparkContext(StreamingContext.scala:838)
    org.apache.spark.streaming.StreamingContext.<init>(StreamingContext.scala:85)
    myproject.spark.predictive_monitoring.predictmyproject$.run(predictmyproject.scala:37)
    myproject.spark.predictive_monitoring.predictmyproject$.main(predictmyproject.scala:31)
    myproject.spark.predictive_monitoring.predictmyproject.main(predictmyproject.scala)

    The currently active SparkContext was created at:

    org.apache.spark.SparkContext.<init>(SparkContext.scala:76)
    org.apache.spark.streaming.StreamingContext$.createNewSparkContext(StreamingContext.scala:838)
    org.apache.spark.streaming.StreamingContext.<init>(StreamingContext.scala:85)
    myproject.spark.predictive_monitoring.predictmyproject$.run(predictmyproject.scala:37)
    myproject.spark.predictive_monitoring.predictmyproject$.main(predictmyproject.scala:31)
    myproject.spark.predictive_monitoring.predictmyproject.main(predictmyproject.scala)

      at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:100)
      at org.apache.spark.sql.SparkSession.<init>(SparkSession.scala:91)
      at org.apache.spark.sql.SparkSession.cloneSession(SparkSession.scala:256)
      at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:268)
      at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
    18/11/29 13:33:30 WARN SSPStreamMicroBatchReader: There is a total of 0 events that came in
    18/11/29 13:33:30 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
    18/11/29 13:33:30 INFO MemoryStore: MemoryStore cleared
    18/11/29 13:33:30 INFO BlockManager: BlockManager stopped
    18/11/29 13:33:30 INFO BlockManagerMaster: BlockManagerMaster stopped
    18/11/29 13:33:30 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
    18/11/29 13:33:31 INFO SparkContext: Successfully stopped SparkContext

0 Answers

No answers yet