We are trying to deserialize Avro messages from Confluent Platform in a Spark Streaming job.
Any ideas about what we might be doing wrong?
For some reason, meetupRDD.isEmpty() always returns true, so the processing branch below never runs.
package com.rsvps;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.clients.consumer.OffsetCommitCallback;
import org.apache.kafka.common.TopicPartition;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.*;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.CanCommitOffsets;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.HasOffsetRanges;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.OffsetRange;
public class SparkDataset {

    private static final String HADOOP_HOME_DIR_VALUE = "C:/hadoop";
    private static final String RUN_LOCAL_WITH_AVAILABLE_CORES = "local[*]";
    private static final String APPLICATION_NAME = "Kafka <- Spark(Dataset) -> MongoDb";
    private static final int BATCH_DURATION_INTERVAL_MS = 5000;

    private static final Map<String, Object> KAFKA_CONSUMER_PROPERTIES;

    private static final String KAFKA_BROKERS = "localhost:31090";
    private static final String SCHEMA_REGISTRY_URL = "https://localhost:8180";
    private static final String KAFKA_OFFSET_RESET_TYPE = "latest";
    private static final String KAFKA_GROUP = "kafka-group";
    private static final String KAFKA_TOPIC = "jdbc-pwcrd-V_CHAIN";

    private static final Collection<String> TOPICS =
            Collections.unmodifiableList(Arrays.asList(KAFKA_TOPIC));
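
    // Consumer config handed to ConsumerStrategies.Subscribe below. With
    // KafkaAvroDeserializer and specific.avro.reader=true, record values are
    // deserialized against the Schema Registry into Avro SpecificRecord instances.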
    static {
        Map<String, Object> kafkaProperties = new HashMap<>();
        kafkaProperties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, KAFKA_BROKERS);
        kafkaProperties.put("schema.registry.url", SCHEMA_REGISTRY_URL);
        kafkaProperties.put("specific.avro.reader", true);
        kafkaProperties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
                "org.apache.kafka.common.serialization.StringDeserializer");
        kafkaProperties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
                "io.confluent.kafka.serializers.KafkaAvroDeserializer");
        kafkaProperties.put(ConsumerConfig.GROUP_ID_CONFIG, KAFKA_GROUP);
        kafkaProperties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, KAFKA_OFFSET_RESET_TYPE);
        kafkaProperties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);

        KAFKA_CONSUMER_PROPERTIES = Collections.unmodifiableMap(kafkaProperties);
    }
    public static void main(String[] args) throws InterruptedException {
        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf conf = new SparkConf()
                .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                .setAppName(APPLICATION_NAME);

        final JavaStreamingContext streamingContext = new JavaStreamingContext(conf,
                new Duration(BATCH_DURATION_INTERVAL_MS));

        final SparkSession sparkSession = SparkSession.builder().config(conf).getOrCreate();
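
        // Direct stream: Spark tracks offsets itself (no receiver). The generic type
        // parameters here declare the key/value types produced by the deserializers.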
        final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
                KafkaUtils.createDirectStream(streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES));

        final JavaDStream<String> meetupStreamValues = meetupStream.map(ConsumerRecord::value);
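
        // Per micro-batch: if the batch is non-empty, parse the values as JSON and
        // append them to disk. In our runs this branch is never taken.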
        meetupStreamValues.foreachRDD((JavaRDD<String> meetupRDD) -> {
            if (!meetupRDD.isEmpty()) {
                Dataset<Row> row = sparkSession.read()
                        .json(sparkSession.createDataset(meetupRDD.rdd(), Encoders.STRING()));

                row.printSchema();
                row.write().mode(SaveMode.Append).format("json").save("test.json");
            }
        });

        // some time later, after outputs have completed
        meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

            ((CanCommitOffsets) meetupStream.inputDStream())
                    .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }
}
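
// Logs the outcome of each asynchronous offset commit.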
final class MeetupOffsetCommitCallback implements OffsetCommitCallback {

    private static final Logger log = Logger.getLogger(MeetupOffsetCommitCallback.class.getName());

    @Override
    public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) {
        log.info("---------------------------------------------------");
        log.log(Level.INFO, "{0} | {1}", new Object[] { offsets, exception });
        log.info("---------------------------------------------------");
    }
}
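
To narrow this down, a minimal standalone consumer along these lines (outside Spark; broker, registry, and topic values copied from the job above; the class name and group id are placeholders, and poll(Duration) assumes kafka-clients 2.0+) should show whether the topic is delivering records at all, and what Java type the Avro value deserializer actually yields:

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

public class TopicSanityCheck {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:31090");
        props.put("schema.registry.url", "https://localhost:8180");
        props.put("group.id", "sanity-check-group"); // throwaway group id (placeholder)
        props.put("auto.offset.reset", "earliest");  // unlike the job, read from the beginning
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");

        // Value type is Object, not String: KafkaAvroDeserializer returns Avro records
        // (GenericRecord here, since specific.avro.reader is left at its default).
        try (KafkaConsumer<String, Object> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList("jdbc-pwcrd-V_CHAIN"));

            ConsumerRecords<String, Object> records = consumer.poll(Duration.ofSeconds(10));
            System.out.println("fetched " + records.count() + " record(s)");

            for (ConsumerRecord<String, Object> record : records) {
                System.out.println(record.offset() + " | "
                        + record.value().getClass().getName() + " | " + record.value());
            }
        }
    }
}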