我通过Kafka流式处理mongo oplog数据。我正在使用Debezium CDC Kafka连接器拖尾mongo oplog。
架构注册表使用AvroConverter转换器对键和值进行序列化
bootstrap.servers = localhost:9092
Kafka key.converter = io.confluent.connect.avro.AvroConverter key.converter.schema.registry.url = http://localhost:8081 value.converter = io.confluent.connect.avro.AvroConverter value.converter.schema.registry.url = http://localhost:8081
internal.key.converter = org.apache.kafka.connect.json.JsonConverter internal.value.converter = org.apache.kafka.connect.json.JsonConverter internal.key.converter.schemas.enable = false internal.value.converter.schemas.enable = false
offset.storage.file.filename = / tmp / connect.offsets
下面的代码流化Kafka数据并使用 KafkaAvroDeserializer
将其反序列化import io.confluent.kafka.schemaregistry.client.rest.RestService
import io.confluent.kafka.serializers.KafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.spark.sql.SparkSession
import scala.collection.JavaConverters._
object KafkaStream{
case class DeserializedFromKafkaRecord(key: String, value: String)
def main(args: Array[String]): Unit = {
val sparkSession = SparkSession
.builder
.master("local[*]")
.appName("kafka")
.getOrCreate()
//sparkSession.sparkContext.setLogLevel("ERROR")
import sparkSession.implicits._
val schemaRegistryURL = "http://127.0.0.1:8081"
val topicName = "prodCollection.inventory.Prod"
val subjectValueName = topicName + "-value"
//create RestService object
val restService = new RestService(schemaRegistryURL)
//.getLatestVersion returns io.confluent.kafka.schemaregistry.client.rest.entities.Schema object.
val valueRestResponseSchema = restService.getLatestVersion(subjectValueName)
//Use Avro parsing classes to get Avro Schema
val parser = new Schema.Parser
val topicValueAvroSchema: Schema = parser.parse(valueRestResponseSchema.getSchema)
//key schema is typically just string but you can do the same process for the key as the value
val keySchemaString = "\"string\""
val keySchema = parser.parse(keySchemaString)
//Create a map with the Schema registry url.
//This is the only Required configuration for Confluent's KafkaAvroDeserializer.
val props = Map("schema.registry.url" -> schemaRegistryURL)
//Declare SerDe vars before using Spark structured streaming map. Avoids non serializable class exception.
var keyDeserializer: KafkaAvroDeserializer = null
var valueDeserializer: KafkaAvroDeserializer = null
//Create structured streaming DF to read from the topic.
val rawTopicMessageDF = sparkSession.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", topicName)
.option("startingOffsets", "earliest")
.option("key.deserializer","KafkaAvroDeserializer")
.option("value.deserializer","KafkaAvroDeserializer")
//.option("maxOffsetsPerTrigger", 20) //remove for prod
.load()
rawTopicMessageDF.printSchema()
//instantiate the SerDe classes if not already, then deserialize!
val deserializedTopicMessageDS = rawTopicMessageDF.map{
row =>
if (keyDeserializer == null) {
keyDeserializer = new KafkaAvroDeserializer
keyDeserializer.configure(props.asJava, true) //isKey = true
}
if (valueDeserializer == null) {
valueDeserializer = new KafkaAvroDeserializer
valueDeserializer.configure(props.asJava, false) //isKey = false
}
//Pass the Avro schema.
val deserializedKeyString = keyDeserializer.deserialize(topicName, row.getAs[Array[Byte]]("key"), keySchema).toString //topic name is actually unused in the source code, just required by the signature. Weird right?
val deserializedValueJsonString = valueDeserializer.deserialize(topicName, row.getAs[Array[Byte]]("value"), topicValueAvroSchema).toString
DeserializedFromKafkaRecord(deserializedKeyString, deserializedValueJsonString)
}
deserializedTopicMessageDS.printSchema()
deserializedTopicMessageDS.writeStream
.outputMode("append")
.format("console")
.option("truncate", false)
.start()
deserializedTopicMessageDS数据集架构已根据需要进行了转换,但由于以下信息而导致流停止,
root
|-- key: binary (nullable = true)
|-- value: binary (nullable = true)
|-- topic: string (nullable = true)
|-- partition: integer (nullable = true)
|-- offset: long (nullable = true)
|-- timestamp: timestamp (nullable = true)
|-- timestampType: integer (nullable = true)
root
|-- key: string (nullable = true)
|-- value: string (nullable = true)
18/08/13 22:53:54 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
18/08/13 22:53:54 INFO StreamExecution: Starting [id = b1fb3ce2-08d0-4d87-b031-af129432d91a, runId = 38b66e4a-040f-42c8-abbe-bc27fa3b9462]. Use /private/var/folders/zf/6dh44_fx1sn2dp2w7d_54wg80000gn/T/temporary-ae7a93f6-0307-4f39-ba44-93d5d3d7c0ab to store the query checkpoint.
18/08/13 22:53:54 INFO SparkContext: Invoking stop() from shutdown hook
18/08/13 22:53:54 INFO SparkUI: Stopped Spark web UI at http://192.168.0.100:4040
18/08/13 22:53:54 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
18/08/13 22:53:54 INFO MemoryStore: MemoryStore cleared
18/08/13 22:53:54 INFO BlockManager: BlockManager stopped
18/08/13 22:53:54 INFO BlockManagerMaster: BlockManagerMaster stopped
18/08/13 22:53:54 INFO ConsumerConfig: ConsumerConfig values:
auto.commit.interval.ms = 5000
auto.offset.reset = earliest
bootstrap.servers = [localhost:9092]
check.crcs = true
client.id =
connections.max.idle.ms = 540000
default.api.timeout.ms = 60000
enable.auto.commit = false
exclude.internal.topics = true
fetch.max.bytes = 52428800
fetch.max.wait.ms = 500
fetch.min.bytes = 1
group.id = spark-kafka-source-b9f0f64b-952d-4733-ba3e-aa753954b2ef--1115279952-driver-0
heartbeat.interval.ms = 3000
interceptor.classes = []
internal.leave.group.on.close = true
isolation.level = read_uncommitted
key.deserializer = class org.apache.kafka.common.serialization.ByteArrayDeserializer
max.partition.fetch.bytes = 1048576
max.poll.interval.ms = 300000
max.poll.records = 1
metadata.max.age.ms = 300000
metric.reporters = []
metrics.num.samples = 2
metrics.recording.level = INFO
metrics.sample.window.ms = 30000
partition.assignment.strategy = [class org.apache.kafka.clients.consumer.RangeAssignor]
receive.buffer.bytes = 65536
reconnect.backoff.max.ms = 1000
reconnect.backoff.ms = 50
request.timeout.ms = 30000
retry.backoff.ms = 100
sasl.client.callback.handler.class = null
sasl.jaas.config = null
sasl.kerberos.kinit.cmd = /usr/bin/kinit
sasl.kerberos.min.time.before.relogin = 60000
sasl.kerberos.service.name = null
sasl.kerberos.ticket.renew.jitter = 0.05
sasl.kerberos.ticket.renew.window.factor = 0.8
sasl.login.callback.handler.class = null
sasl.login.class = null
sasl.login.refresh.buffer.seconds = 300
sasl.login.refresh.min.period.seconds = 60
sasl.login.refresh.window.factor = 0.8
sasl.login.refresh.window.jitter = 0.05
sasl.mechanism = GSSAPI
security.protocol = PLAINTEXT
send.buffer.bytes = 131072
session.timeout.ms = 10000
ssl.cipher.suites = null
ssl.enabled.protocols = [TLSv1.2, TLSv1.1, TLSv1]
ssl.endpoint.identification.algorithm = https
ssl.key.password = null
ssl.keymanager.algorithm = SunX509
ssl.keystore.location = null
ssl.keystore.password = null
ssl.keystore.type = JKS
ssl.protocol = TLS
ssl.provider = null
ssl.secure.random.implementation = null
ssl.trustmanager.algorithm = PKIX
ssl.truststore.location = null
ssl.truststore.password = null
ssl.truststore.type = JKS
value.deserializer = class org.apache.kafka.common.serialization.ByteArrayDeserializer
18/08/13 22:53:54 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
18/08/13 22:53:54 INFO SparkContext: Successfully stopped SparkContext
18/08/13 22:53:54 INFO ShutdownHookManager: Shutdown hook called
18/08/13 22:53:54 INFO ShutdownHookManager: Deleting directory /private/var/folders/zf/6dh44_fx1sn2dp2w7d_54wg80000gn/T/spark-e1c2b259-39f2-4d65-9919-74ab1ad6acae
18/08/13 22:53:54 INFO ShutdownHookManager: Deleting directory /private/var/folders/zf/6dh44_fx1sn2dp2w7d_54wg80000gn/T/temporary-ae7a93f6-0307-4f39-ba44-93d5d3d7c0ab
Process finished with exit code 0