How do I resolve a Current Committed Offset that differs from the Current Available Offset?

Date: 2020-07-31 12:35:47

Tags: java scala maven apache-spark apache-kafka

I'm trying to read Avro data from Kafka with Spark Structured Streaming, but I get the following error:

Streaming Query Exception caught!: org.apache.spark.sql.streaming.StreamingQueryException: Job aborted.
=== Streaming Query ===
Identifier: [id = 8b54c92d-6bbc-4dbc-84d0-55b762c21ba2, runId = 4bc92b3c-343e-4886-b0bc-0777b89f9ec8]
Current Committed Offsets: {KafkaV2[Subscribe[customer-avro4]]: {"customer-avro":{"0":17}}}
Current Available Offsets: {KafkaV2[Subscribe[customer-avro4]]: {"customer-avro":{"0":20}}}

Current State: ACTIVE
Thread State: RUNNABLE

Any idea what the problem might be and how to fix it? The code is below (inspired by the xebia-france spark-structured-streaming blog). It actually seemed to run fine earlier, but now this issue has appeared.

import com.databricks.spark.avro.SchemaConverters
import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.StreamingQueryException

object AvroConsumer {
  private val topic = "customer-avro4"
  private val kafkaUrl = "http://localhost:9092"
  private val schemaRegistryUrl = "http://localhost:8081"

  // Schema Registry client, plus a deserializer that uses it to decode Confluent-framed Avro
  private val schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
  private val kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)

  // Fetch the latest value schema for the topic and convert it to a Spark SQL schema
  private val avroSchema = schemaRegistryClient.getLatestSchemaMetadata(topic + "-value").getSchema
  private val sparkSchema = SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ConfluentConsumer")
      .master("local[*]")
      .getOrCreate()

    spark.sparkContext.setLogLevel("ERROR")

    // UDF that turns the raw Kafka value bytes into a JSON string via the Avro deserializer
    spark.udf.register("deserialize", (bytes: Array[Byte]) =>
      DeserializerWrapper.deserializer.deserialize(bytes)
    )

    val kafkaDataFrame = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaUrl)
      .option("subscribe", topic)
      .load()

    val valueDataFrame = kafkaDataFrame.selectExpr("""deserialize(value) AS message""")

    import org.apache.spark.sql.functions._

    // Parse the JSON string with the schema derived from the registry and flatten the struct
    val formattedDataFrame = valueDataFrame.select(
      from_json(col("message"), sparkSchema.dataType).alias("parsed_value"))
      .select("parsed_value.*")

    // Parquet sink; streaming progress (offsets) is tracked in the checkpoint directory on HDFS
    val writer = formattedDataFrame
      .writeStream
      .format("parquet")
      .option("checkpointLocation", "hdfs://localhost:9000/data/spark/parquet/checkpoint")

    // Keep restarting the query if it terminates with a streaming exception
    while (true) {
      val query = writer.start("hdfs://localhost:9000/data/spark/parquet/total")

      try {
        query.awaitTermination()
      }
      catch {
        case e: StreamingQueryException => println("Streaming Query Exception caught!: " + e)
      }
    }
  }

  object DeserializerWrapper {
    val deserializer: AvroDeserializer = kafkaAvroDeserializer
  }

  class AvroDeserializer extends AbstractKafkaAvroDeserializer {
    def this(client: SchemaRegistryClient) {
      this()
      this.schemaRegistry = client
    }

    // Decode the Confluent Avro payload and return the record rendered as a string
    override def deserialize(bytes: Array[Byte]): String = {
      val genericRecord = super.deserialize(bytes).asInstanceOf[GenericRecord]
      genericRecord.toString
    }
  }

}

1 Answer:

Answer 0 (score: 0)

Figured it out: the problem wasn't the Spark-Kafka integration itself, as I had first assumed, but the checkpoint information on the HDFS file system. Deleting and recreating the checkpoint folder in HDFS solved it for me.
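
For reference, here is a minimal sketch of clearing that checkpoint directory programmatically with the Hadoop FileSystem API. The path is the checkpointLocation from the question and is only an assumption for your environment; the same effect can be achieved from the shell by removing that path with hdfs dfs -rm -r.

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ResetCheckpoint {
  def main(args: Array[String]): Unit = {
    // Assumed checkpoint path, taken from the question; adjust for your cluster
    val checkpointDir = "hdfs://localhost:9000/data/spark/parquet/checkpoint"

    val fs = FileSystem.get(new URI(checkpointDir), new Configuration())
    val path = new Path(checkpointDir)

    // Recursively delete the stale checkpoint and recreate an empty directory,
    // so the next run of the query starts with fresh offset tracking
    if (fs.exists(path)) fs.delete(path, true)
    fs.mkdirs(path)
    fs.close()
  }
}

Keep in mind that discarding the checkpoint also discards the committed offsets, so the restarted query begins from whatever startingOffsets dictates (latest by default for the Kafka source) rather than resuming where it left off.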