Java Kafka consumer deserialization of Avro

Date: 2020-07-20 09:29:22

Tags: apache-spark apache-kafka confluent-schema-registry

We are trying to deserialize some Avro messages from Confluent Platform in a Spark Streaming job, but for some reason meetupRDD.isEmpty is always true.

Any ideas about what we might be doing wrong?


package com.rsvps;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.clients.consumer.OffsetCommitCallback;
import org.apache.kafka.common.TopicPartition;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.*;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.CanCommitOffsets;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.HasOffsetRanges;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.OffsetRange;

public class SparkDataset {

        private static final String HADOOP_HOME_DIR_VALUE = "C:/hadoop";

        private static final String RUN_LOCAL_WITH_AVAILABLE_CORES = "local[*]";
        private static final String APPLICATION_NAME = "Kafka <- Spark(Dataset) -> MongoDb";

        private static final int BATCH_DURATION_INTERVAL_MS = 5000;

        private static final Map<String, Object> KAFKA_CONSUMER_PROPERTIES;
        
        private static final String KAFKA_BROKERS = "localhost:31090";
        private static final String SCHEMA_REGISTRY_URL = "https://localhost:8180";
        private static final String KAFKA_OFFSET_RESET_TYPE = "latest";
        private static final String KAFKA_GROUP = "kafka-group";
        private static final String KAFKA_TOPIC = "jdbc-pwcrd-V_CHAIN";
        private static final Collection<String> TOPICS = 
            Collections.unmodifiableList(Arrays.asList(KAFKA_TOPIC));

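        // Shared Kafka consumer configuration used by the Spark direct stream below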
        static {
                Map<String, Object> kafkaProperties = new HashMap<>();
                kafkaProperties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, KAFKA_BROKERS);
                kafkaProperties.put("schema.registry.url", SCHEMA_REGISTRY_URL);
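                // specific.avro.reader=true makes KafkaAvroDeserializer return generated SpecificRecord classes instead of GenericRecord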
                kafkaProperties.put("specific.avro.reader", true);
                kafkaProperties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
                kafkaProperties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "io.confluent.kafka.serializers.KafkaAvroDeserializer");
                kafkaProperties.put(ConsumerConfig.GROUP_ID_CONFIG, KAFKA_GROUP);
                kafkaProperties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, KAFKA_OFFSET_RESET_TYPE);
                kafkaProperties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);

                KAFKA_CONSUMER_PROPERTIES = Collections.unmodifiableMap(kafkaProperties);
        }        

        public static void main(String[] args) throws InterruptedException {

                System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

                final SparkConf conf = new SparkConf()
                                .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                                .setAppName(APPLICATION_NAME);

                final JavaStreamingContext streamingContext = new JavaStreamingContext(conf,
                                new Duration(BATCH_DURATION_INTERVAL_MS));

                final SparkSession sparkSession = SparkSession.builder().config(conf).getOrCreate();

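                // Direct Kafka stream subscribed to the topic; keys and values are declared as String here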
                final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
                        KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(),
                                ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES));


                final JavaDStream<String> meetupStreamValues = meetupStream.map(ConsumerRecord::value);

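                // For each non-empty micro-batch, parse the String values as JSON and append them to a local JSON output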
                meetupStreamValues.foreachRDD((JavaRDD<String> meetupRDD) -> {

                        if (!meetupRDD.isEmpty()) {
                                Dataset<Row> row = sparkSession.read()
                                    .json(sparkSession.createDataset(meetupRDD.rdd(), Encoders.STRING()));
                                    
                                row.printSchema();

                                row.write().mode(SaveMode.Append).format("json").save("test.json");
                        }
                });

                // some time later, after outputs have completed
                meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
                        OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

                        ((CanCommitOffsets) meetupStream.inputDStream())
                                .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
                });

                streamingContext.start();
                streamingContext.awaitTermination();    
        }
}

final class MeetupOffsetCommitCallback implements OffsetCommitCallback {

        private static final Logger log = Logger.getLogger(MeetupOffsetCommitCallback.class.getName());

        @Override
        public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) {
                log.info("---------------------------------------------------");
                log.log(Level.INFO, "{0} | {1}", new Object[] { offsets, exception });
                log.info("---------------------------------------------------");
        }
}
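
To rule out the Spark layer, a minimal plain-Kafka consumer sketch against the same topic could be used to confirm that records are arriving at all. The broker address, schema registry URL, and topic name below are copied from the job above; the class name, group id, earliest offset reset, and bounded poll loop are only assumptions made for this check (earliest so that already-published records show up even when nothing new is produced):

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

public class PlainAvroConsumerCheck {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:31090");
        props.put(ConsumerConfig.GROUP_ID_CONFIG, "plain-check-group"); // hypothetical group id for the check
        props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
                "org.apache.kafka.common.serialization.StringDeserializer");
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
                "io.confluent.kafka.serializers.KafkaAvroDeserializer");
        props.put("schema.registry.url", "https://localhost:8180");
        // specific.avro.reader left unset, so values come back as GenericRecord

        try (KafkaConsumer<String, GenericRecord> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList("jdbc-pwcrd-V_CHAIN"));
            for (int i = 0; i < 10; i++) {
                ConsumerRecords<String, GenericRecord> records = consumer.poll(Duration.ofSeconds(5));
                for (ConsumerRecord<String, GenericRecord> record : records) {
                    // GenericRecord#toString() renders the Avro record as JSON-like text
                    System.out.printf("offset=%d key=%s value=%s%n",
                            record.offset(), record.key(), record.value());
                }
            }
        }
    }
}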


0 Answers:

No answers yet