Spark Streaming Receiver shows negative "Records" numbers for the last batch

Date: 2015-04-23 08:42:45

Tags: apache-spark spark-streaming receiver

I ran into the following problem with a Spark Streaming receiver: the UI portal shows a negative number in the "Records" column for the last batch. What does this mean? Does it mean the last batch of data did not finish processing?

Pipeline details: custom receiver on Spark 1.2.1 & Scala 2.10.4

(Screenshot: Spark Streaming UI showing negative record counts for the last batch)

Here is the code in the receiver (it is a bit involved because of the Azure SDK and Microsoft Bond):

// Imports added for completeness; the Azure Storage package names are assumed
// for the Java SDK of that era. BondSerializableWrapper, EventEnvelope and
// AriaDataDeserializationUtils are project-specific classes not shown here.
import java.io.ByteArrayOutputStream

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer

import com.microsoft.azure.storage.CloudStorageAccount
import com.microsoft.azure.storage.blob.CloudBlobContainer
import com.microsoft.azure.storage.queue.{CloudQueue, CloudQueueMessage}

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

class AriaQueueDataReceiver(storageConnectionString: String, queueName: String, blobContainerName: String, threadCount: Int) extends Receiver[BondSerializableWrapper[EventEnvelope]](StorageLevel.MEMORY_ONLY) {

  override def onStart(): Unit = {
    // start multiple threads to receive data to maximize IO usage
    for(i <- 0 until threadCount) {
      new Thread(s"AriaQueueDataReceiver[$i]") {
        override def run() {
          receive()
        }
      }.start()
    }
  }

  override def onStop(): Unit = {
    // nothing to clean up here: the receive threads exit on their own
    // once isStopped() returns true
  }

  def commit(envelopes: Iterable[EventEnvelope]): Boolean = {
    try {
      val wrappers = envelopes.map(e => new BondSerializableWrapper[EventEnvelope](e, classOf[EventEnvelope]))
      store(wrappers.iterator)
      true
    }
    catch {
      case e: Throwable =>
        // store() failed (e.g. the receiver is stopping); report failure so the
        // caller does not delete the queue message and it can be retried later
        false
    }
  }

  def processQueueMessage(queueMessage: CloudQueueMessage, queue: CloudQueue, blobContainer: CloudBlobContainer): Unit = {
    // read the queue message content
    val queueMessageBytes = queueMessage.getMessageContentAsByte

    if (queueMessageBytes != null) {
      // blob location in the queue message points to the actual data blob
      val blobLocation = AriaDataDeserializationUtils.getBlobLocation(queueMessageBytes)

      if (blobLocation != null) {
        val correctedBlobLocation =
          if (blobLocation.startsWith(blobContainer.getUri.toString))
            blobLocation.substring(blobContainer.getUri.toString.length + 1)
          else
            blobLocation

        // download and read the data blob
        val blob = blobContainer.getBlockBlobReference(correctedBlobLocation)

        if (blob.exists()) {
          val bufferStream = new ByteArrayOutputStream()
          blob.download(bufferStream)
          val blobContent = bufferStream.toByteArray
          bufferStream.close()

          // deserialize the data blob into DataPackage objects
          val exceptionBuffer = new ArrayBuffer[Throwable]
          val dataPackages = AriaDataDeserializationUtils.deserializeDataPackages(blobContent, exceptionBuffer)

          // DataPackage is the schema used inside ARIA, while our pipeline is supposed to deal with the EventEnvelope schema.
          // Here we convert DataPackage objects into EventEnvelope objects so that the rest of the pipeline only deals with EventEnvelopes.
          val events = AriaDataDeserializationUtils.convertDataPackagesToEventEnvelopes(dataPackages)

          // try to commit the EventEnvelope objects into Spark Streaming.
          // if the commit fails, do nothing: the queue message becomes visible
          // again after its visibility timeout and will be retried.
          if (commit(events)) {
            queue.deleteMessage(queueMessage)
            blob.delete()
          }

          // commit the exceptions as well so that the data pipeline can choose to handle the errors
          val exceptionWrappers = exceptionBuffer.map(e => {
            val wrapper = new BondSerializableWrapper[EventEnvelope](null, classOf[EventEnvelope])
            wrapper.setException(e)
            wrapper
          })

          store(exceptionWrappers.iterator)
        }
        else {
          queue.deleteMessage(queueMessage)

          val wrapper = new BondSerializableWrapper[EventEnvelope](null, classOf[EventEnvelope])
          wrapper.setException(new InternalError("Blob could not be found"))

          store(wrapper)
        }
      }
    }
  }

  def receive(): Unit = {
    // initialize azure storage client
    val storageAccount = CloudStorageAccount.parse(storageConnectionString)
    val queueClient = storageAccount.createCloudQueueClient()
    val queue = queueClient.getQueueReference(queueName)

    val blobClient = storageAccount.createCloudBlobClient()
    val blobContainer = blobClient.getContainerReference(blobContainerName)

    while(!isStopped()) {
      try {
        for (queueMessage <- queue.retrieveMessages(32)) {
          processQueueMessage(queueMessage, queue, blobContainer)
        }
      }
      catch {
        case e: Throwable =>
          // any uncaught exception during queue message processing will be stored as well
          // so that the data pipeline can choose to handle the errors
          val wrapper = new BondSerializableWrapper[EventEnvelope](null, classOf[EventEnvelope])
          wrapper.setException(e)
          store(wrapper)
      }
    }
  }
}
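
For reference, here is a minimal sketch of how a receiver like this would typically be wired into a Spark Streaming application. This is not from the original post: the application name, batch interval, connection string, queue/container names, and thread count are all placeholder assumptions.

// Hypothetical wiring sketch; all literal values below are placeholders.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object AriaQueuePipeline {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AriaQueuePipeline")
    val ssc = new StreamingContext(conf, Seconds(10))

    // receiverStream registers the custom receiver; every item passed to store()
    // inside the receiver becomes one record of the resulting DStream
    val envelopes = ssc.receiverStream(
      new AriaQueueDataReceiver(
        "DefaultEndpointsProtocol=https;...", // placeholder connection string
        "aria-queue",                         // placeholder queue name
        "aria-blobs",                         // placeholder blob container name
        4))                                   // number of receive threads

    // the per-batch record count printed here is the same figure the
    // streaming UI shows in its "Records" column
    envelopes.count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}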

Thanks,

-Tao

0 Answers:

There are no answers yet.