On startup, my Spark Kafka direct-stream job only consumes messages that were produced before the job was started. Even though I can see newly produced messages in Kafka, any message produced after the job is running is not consumed until the job is restarted.
public class Test {
// Class-wide logger; final because it is assigned exactly once.
private static final Logger logger = Logger.getLogger(Test.class.getName());

/** No instance state; all entry points are static. */
public Test() {
}
/**
 * Streaming-job entry point: consumes listing ids from the "push to ES" Kafka
 * topic via a direct stream, persisting each batch's offset ranges into a
 * Cassandra kafka-event-log table so the stream can be re-created from the
 * last stored offsets after a restart.
 *
 * @param args args[0] is the job-configuration locator passed to
 *             {@code PcsSparkJobUtil.loadJobConfig}; when absent the method
 *             returns without starting a streaming context
 */
public static void main(final String[] args) {
    if ((args != null) && (args.length > 0) && (args[0] != null)) {
        final SparkJobConfig sparkJobConfig = PcsSparkJobUtil.loadJobConfig(
            args[0]);
        final SparkConf sparkConf = new SparkConf();
        SparkJobUtils.initializeSparkConf(sparkJobConfig, sparkConf, sparkJobConfig.getSparkJobName());
        final JavaStreamingContext jssc = new JavaStreamingContext(
            sparkConf, new Duration(sparkJobConfig.getSparkjobStreamingKafkaPollingDuration()));
        final CassandraConnector cassandraConnector = CassandraConnector.apply(sparkConf);
        final Session session = cassandraConnector.openSession();
        session.execute("USE " + sparkJobConfig.getCassandraKeyspace());
        // Seed offsets for resuming the direct stream from a previous run.
        final Map<TopicAndPartition, Long> kafkaTopicPartition = new HashMap<>();
        final Map<String, String> kafkaParamMap = KafkaInternalUtils.getConsumerProperties(
            sparkJobConfig.getElasticSearchConfig().getEsKafkaProducerUrl(),
            sparkJobConfig.getElasticSearchConfig().getPushToESKafkaTopic(),
            sparkJobConfig.getElasticSearchConfig().getEsKafkaConsumerStartFromSmallestOffset(),
            sparkJobConfig);
        final Set<String> topicSet = KafkaInternalUtils.getTopicSet(sparkJobConfig.getElasticSearchConfig().getPushToESKafkaTopic());
        final JavaDStream<String> listingIdsDStream;
        final List<KafkaEventLog> offsetsListCassandra = KafkaInternalUtils.getDataFromKafkaLogTable(session);
        if (offsetsListCassandra == null || offsetsListCassandra.isEmpty()) {
            // No stored offsets: create a fresh stream (start position governed by
            // auto.offset.reset in kafkaParamMap) and persist each non-empty
            // batch's offset ranges as a side effect of the transform.
            listingIdsDStream = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class,
                StringDecoder.class, kafkaParamMap, topicSet).transformToPair(pairRDD -> {
                    if (!pairRDD.isEmpty()) {
                        final OffsetRange[] offsets = ((HasOffsetRanges) pairRDD.rdd()).offsetRanges();
                        writeOffsetsFormingSession(sparkJobConfig, offsets);
                    }
                    return pairRDD;
                }).map(stringStringTuple2 -> stringStringTuple2._2());
        } else {
            // Resume from the stored offsets. Seed with the *until* offset of the
            // last persisted batch: it is the exclusive upper bound of what was
            // already processed, i.e. the correct next from-offset. Seeding with
            // the stored from-offset re-consumed the last batch on every restart.
            for (final KafkaEventLog eventLog : offsetsListCassandra) {
                kafkaTopicPartition.put(new TopicAndPartition(sparkJobConfig.getElasticSearchConfig().getPushToESKafkaTopic(),
                        Integer.parseInt(eventLog.getPartition_number())),
                    Long.parseLong(eventLog.getSet_until_offset()));
            }
            final JavaInputDStream<String> stream = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class, StringDecoder.class,
                String.class, kafkaParamMap, kafkaTopicPartition, (messageAndMetadata) -> messageAndMetadata.message());
            listingIdsDStream = stream.transform(rdd -> {
                if (!rdd.isEmpty()) {
                    final OffsetRange[] offsetsForThisBatch = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
                    writeOffsetsFormingSession(sparkJobConfig, offsetsForThisBatch);
                }
                return rdd;
            });
        }
        listingIdsDStream.foreachRDD(stringJavaRDD1 -> {
            stringJavaRDD1.foreachPartition(stringIterator -> {
                while (stringIterator.hasNext()) {
                    String val = stringIterator.next();
                    System.out.println("Danish Vale:-" + val);
                    logger.info("Danish Vale:-" + val);
                }
            });
        });
        // The driver-side session is only needed for the offset bootstrap above;
        // release it before the long-running streaming context starts.
        session.close();
        jssc.start();
        try {
            jssc.awaitTermination();
        } catch (Exception e) {
            logger.error("Exception termination job ", e);
        }
    }
}
/**
 * Persists one row per consumed {@link OffsetRange} into the Cassandra
 * kafka-event-log table (topic, partition, from/until offsets, audit columns).
 *
 * @param session open Cassandra session the inserts run on; not closed here —
 *                the caller owns its lifecycle
 * @param offsets offset ranges of the batch that was just consumed
 */
private static void writeOffset(Session session, final OffsetRange[] offsets) {
    logger.info("Danish Offsets written");
    for (OffsetRange offsetRange : offsets) {
        KafkaEventLog eventLog = new KafkaEventLog();
        // topic() already returns a String; no valueOf conversion needed.
        eventLog.setTopic_name(offsetRange.topic());
        eventLog.setPartition_number(String.valueOf(offsetRange.partition()));
        eventLog.setSet_from_offset(String.valueOf(offsetRange.fromOffset()));
        eventLog.setSet_until_offset(String.valueOf(offsetRange.untilOffset()));
        // Current wall-clock time, without the redundant java.util.Date round-trip.
        eventLog.setInsert_timestamp(new java.sql.Date(System.currentTimeMillis()));
        eventLog.setInserted_by("ESKafkaPush");
        KafkaInternalUtils.insertIntoKafkaEventLogTable(eventLog, session);
    }
}
/**
 * Opens a short-lived Cassandra session, writes the given offset ranges via
 * {@link #writeOffset}, and always releases both the session and its cluster.
 * Failures are logged and deliberately swallowed: offset bookkeeping must not
 * fail the streaming batch.
 *
 * @param sparkJobConfig configuration used to build the Cassandra session
 * @param offsets        offset ranges of the current batch
 */
public static void writeOffsetsFormingSession(SparkJobConfig sparkJobConfig, OffsetRange[] offsets) {
    Session session = null;
    try {
        session = SparkJobUtils.getCassandraSession(sparkJobConfig);
        writeOffset(session, offsets);
    } catch (Exception e) {
        logger.error("Exception processing kafka offsets", e);
    } finally {
        if (session != null) {
            try {
                session.close();
            } finally {
                // Close the cluster even if closing the session threw,
                // otherwise its I/O threads and connections leak.
                session.getCluster().close();
            }
        }
    }
}
}
// Here is my KafkaConsumer creation method:
/**
 * Builds the Kafka parameter map for the direct-stream consumer.
 *
 * @param consumerBrokerUrl comma-separated broker list for metadata.broker.list
 * @param topicName         topic the consumer reads (not referenced here)
 * @param setSmallestOffset when TRUE, set auto.offset.reset=smallest so a fresh
 *                          consumer group starts from the earliest offset
 * @param sparkJobConfig    job configuration (not referenced here)
 * @return mutable map of Kafka consumer properties
 */
public static Map<String, String> getConsumerProperties(String consumerBrokerUrl, String topicName, Boolean setSmallestOffset, SparkJobConfig sparkJobConfig) {
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", consumerBrokerUrl);
    kafkaParams.put("group.id", "kafka_es_push");
    //kafkaParams.put("zookeeper.connect", sparkJobConfig.getElasticSearchConfig().getEsKafkaConsumerUrl());
    // Boolean.TRUE.equals guards against a null Boolean (auto-unboxing NPE).
    if (Boolean.TRUE.equals(setSmallestOffset)) {
        kafkaParams.put("auto.offset.reset", "smallest");
    }
    return kafkaParams;
}
Ironically, all of this works fine when run locally. But when I deploy it to my Spark Mesos cluster, it only consumes the previously produced messages.