I am running into the following problem with a Spark Streaming receiver: the UI portal shows negative numbers in the "Records" column for the last batches. What does this mean? Does it mean that the last batches of data have not finished processing?
Pipeline details: a custom receiver on Spark 1.2.1 with Scala 2.10.4.
Here is the code in the receiver (it is a bit involved because of the Azure SDK and Microsoft Bond):
class AriaQueueDataReceiver(storageConnectionString: String, queueName: String, blobContainerName: String, threadCount: Int)
  extends Receiver[BondSerializableWrapper[EventEnvelope]](StorageLevel.MEMORY_ONLY) {

  override def onStart(): Unit = {
    // start multiple threads to receive data to maximize IO usage
    for (i <- 0 until threadCount) {
      new Thread(s"AriaQueueDataReceiver[$i]") {
        override def run() {
          receive()
        }
      }.start()
    }
  }

  override def onStop(): Unit = {
  }

  // wrap the events and hand them to Spark Streaming; returns false if store() throws
  def commit(envelopes: Iterable[EventEnvelope]): Boolean = {
    try {
      val wrappers = envelopes.map(e => new BondSerializableWrapper[EventEnvelope](e, classOf[EventEnvelope]))
      store(wrappers.iterator)
      true
    }
    catch {
      case e: Throwable =>
        false
    }
  }

  def processQueueMessage(queueMessage: CloudQueueMessage, queue: CloudQueue, blobContainer: CloudBlobContainer): Unit = {
    // read the queue message content
    val queueMessageBytes = queueMessage.getMessageContentAsByte
    if (queueMessageBytes != null) {
      // the blob location in the queue message points to the actual data blob
      val blobLocation = AriaDataDeserializationUtils.getBlobLocation(queueMessageBytes)
      if (blobLocation != null) {
        val correctedBlobLocation = if (blobLocation.startsWith(blobContainer.getUri.toString)) blobLocation.substring(blobContainer.getUri.toString.length + 1) else blobLocation
        // download and read the data blob
        val blob = blobContainer.getBlockBlobReference(correctedBlobLocation)
        if (blob.exists()) {
          val bufferStream = new ByteArrayOutputStream()
          blob.download(bufferStream)
          val blobContent = bufferStream.toByteArray
          bufferStream.close()
          // deserialize the data blob into DataPackage objects
          val exceptionBuffer = new ArrayBuffer[Throwable]
          val dataPackages = AriaDataDeserializationUtils.deserializeDataPackages(blobContent, exceptionBuffer)
          // DataPackage is the schema used inside ARIA, while our pipeline is supposed to deal with the EventEnvelope schema.
          // Here we convert the DataPackage objects into EventEnvelope objects so that our pipeline logic only deals with EventEnvelopes.
          val events = AriaDataDeserializationUtils.convertDataPackagesToEventEnvelopes(dataPackages)
          // try to commit the EventEnvelope objects into Spark Streaming.
          // if the commit fails, do nothing; the queue message will reappear in the queue after a fixed time period.
          if (commit(events)) {
            queue.deleteMessage(queueMessage)
            blob.delete()
          }
          // commit the exceptions as well so that the data pipeline can choose to handle the errors
          val exceptionWrappers = exceptionBuffer.map(e => {
            val wrapper = new BondSerializableWrapper[EventEnvelope](null, classOf[EventEnvelope])
            wrapper.setException(e)
            wrapper
          })
          store(exceptionWrappers.iterator)
        }
        else {
          // the blob is gone: drop the queue message and report the error downstream
          queue.deleteMessage(queueMessage)
          val wrapper = new BondSerializableWrapper[EventEnvelope](null, classOf[EventEnvelope])
          wrapper.setException(new InternalError(s"Blob could not be found"))
          store(wrapper)
        }
      }
    }
  }

  def receive(): Unit = {
    // initialize the azure storage client
    val storageAccount = CloudStorageAccount.parse(storageConnectionString)
    val queueClient = storageAccount.createCloudQueueClient()
    val queue = queueClient.getQueueReference(queueName)
    val blobClient = storageAccount.createCloudBlobClient()
    val blobContainer = blobClient.getContainerReference(blobContainerName)
    // keep polling the queue until the receiver is stopped
    while (!isStopped()) {
      try {
        for (queueMessage <- queue.retrieveMessages(32)) {
          processQueueMessage(queueMessage, queue, blobContainer)
        }
      }
      catch {
        case e: Throwable =>
          // any uncaught exception during queue message processing is stored as well
          // so that the data pipeline can choose to handle the errors
          val wrapper = new BondSerializableWrapper[EventEnvelope](null, classOf[EventEnvelope])
          wrapper.setException(e)
          store(wrapper)
      }
    }
  }
}
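For reference, a receiver like this would be hooked into Spark Streaming roughly as follows. This is only a minimal sketch: the app name, batch interval, connection string, queue/container names and the foreachRDD body are placeholders, not the actual pipeline code.

// minimal sketch: wiring the custom receiver into a StreamingContext (placeholder values throughout)
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object AriaQueueStreamSketch {
  def main(args: Array[String]): Unit = {
    // placeholder configuration values
    val storageConnectionString = "DefaultEndpointsProtocol=https;AccountName=...;AccountKey=..."
    val queueName = "aria-queue"
    val blobContainerName = "aria-blobs"

    val conf = new SparkConf().setAppName("AriaQueueDataReceiverSketch")
    val ssc = new StreamingContext(conf, Seconds(60))

    // receiverStream() creates an input DStream backed by the custom receiver;
    // the per-batch "Records" figure in the UI is derived from the items the receiver passes to store()
    val envelopes = ssc.receiverStream(
      new AriaQueueDataReceiver(storageConnectionString, queueName, blobContainerName, threadCount = 4))

    // placeholder action so the stream is materialized
    envelopes.foreachRDD(rdd => println(s"batch size: ${rdd.count()}"))

    ssc.start()
    ssc.awaitTermination()
  }
}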
Thanks,
-Tao