Spark Kafka Direct Stream not consuming newly produced messages (Spark 2.3.1, Kafka 0.10)

Date: 2018-09-22 01:10:49

Tags: apache-spark apache-kafka spark-streaming kafka-consumer-api

On startup, my Spark Kafka Direct Stream job only consumes messages that were produced before the job was started. Any messages produced while the job is running are not consumed until the job is restarted, even though I can see those newly produced messages in Kafka.

public class Test {

private static Logger logger = Logger.getLogger(Test.class.getName());

public Test() {
}

/**
 * DOCUMENT ME!
 *
 * @param  args
 */
public static void main(final String[] args) {
    if ((args != null) && (args.length > 0) && (args[0] != null)) {
        final SparkJobConfig sparkJobConfig = PcsSparkJobUtil.loadJobConfig(
                args[0]);

        final SparkConf sparkConf = new SparkConf();

        SparkJobUtils.initializeSparkConf(sparkJobConfig, sparkConf, sparkJobConfig.getSparkJobName());

        final JavaStreamingContext jssc = new JavaStreamingContext(
                sparkConf, new Duration(sparkJobConfig.getSparkjobStreamingKafkaPollingDuration()));
        final Map<String, Integer> topicMap = new HashMap<>();

        final CassandraConnector cassandraConnector = CassandraConnector.apply(sparkConf);
        final Session session = cassandraConnector.openSession();
        session.execute("USE " + sparkJobConfig.getCassandraKeyspace());


        HashMap<TopicAndPartition, Long> kafkaTopicPartition = new HashMap<TopicAndPartition, Long>();

        Map<String, String> kafkaParamMap = KafkaInternalUtils.getConsumerProperties(sparkJobConfig.getElasticSearchConfig().getEsKafkaProducerUrl(),
                sparkJobConfig.getElasticSearchConfig().getPushToESKafkaTopic(), sparkJobConfig.getElasticSearchConfig().getEsKafkaConsumerStartFromSmallestOffset(), sparkJobConfig);
        Set<String> topicSet = KafkaInternalUtils.getTopicSet(sparkJobConfig.getElasticSearchConfig().getPushToESKafkaTopic());


        JavaDStream<String> listingIdsDStream;
        List<KafkaEventLog> offsetsListCassandra = KafkaInternalUtils.getDataFromKafkaLogTable(session);

        /**
         * If No Offsets Create Fresh Stream
         */
        if(offsetsListCassandra == null || offsetsListCassandra.isEmpty()){
            listingIdsDStream = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class,
                    StringDecoder.class, kafkaParamMap, topicSet).transformToPair(stringStringJavaPairRDD -> {
                if(!stringStringJavaPairRDD.isEmpty()){
                    final OffsetRange[] offsets = ((HasOffsetRanges) stringStringJavaPairRDD.rdd()).offsetRanges();
                    writeOffsetsFormingSession(sparkJobConfig, offsets);
                }
                return stringStringJavaPairRDD;
            }).map(stringStringTuple2 -> stringStringTuple2._2());

        }else{

            /**
             * Create Stream From Offsets
             */
            for (KafkaEventLog eventLog : offsetsListCassandra) {
                kafkaTopicPartition.put(new TopicAndPartition(sparkJobConfig.getElasticSearchConfig().getPushToESKafkaTopic(),
                                Integer.parseInt(eventLog.getPartition_number())),
                        Long.parseLong(eventLog.getSet_from_offset()));
            }
            JavaInputDStream<String> stream = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class, StringDecoder.class,
                    String.class, kafkaParamMap, kafkaTopicPartition, (messageAndMetadata) -> messageAndMetadata.message());
            listingIdsDStream = stream.transform(rdd -> {
                if(!rdd.isEmpty()){
                    final OffsetRange[] offsetsForThisBatch = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
                    writeOffsetsFormingSession(sparkJobConfig, offsetsForThisBatch);
                }
                return rdd;
            });

        }

        listingIdsDStream.foreachRDD(stringJavaRDD1 -> {
            stringJavaRDD1.foreachPartition(stringIterator -> {
                while(stringIterator.hasNext()) {
                    String val = stringIterator.next();
                    System.out.println("Danish Vale:-" + val);
                    logger.info("Danish Vale:-" + val);
                }
            });
        });

        jssc.start();
        try {
            jssc.awaitTermination();
        }catch (Exception e){
            logger.error("Exception termination job ", e);
        }

    }

}


private static void writeOffset(Session session, final OffsetRange[] offsets) {
    logger.info("Danish Offsets written");
    for (OffsetRange offsetRange : offsets) {
        KafkaEventLog eventLog = new KafkaEventLog();
        eventLog.setTopic_name(String.valueOf(offsetRange.topic()));
        eventLog.setPartition_number(String.valueOf(offsetRange.partition()));
        eventLog.setSet_from_offset(String.valueOf(offsetRange.fromOffset()));
        eventLog.setSet_until_offset(String.valueOf(offsetRange.untilOffset()));
        eventLog.setInsert_timestamp(new java.sql.Date(new Date().getTime()));
        eventLog.setInserted_by("ESKafkaPush");
        KafkaInternalUtils.insertIntoKafkaEventLogTable(eventLog, session);
    }
}


public static void writeOffsetsFormingSession(SparkJobConfig sparkJobConfig, OffsetRange[] offsets){
    Session session1 = null;
    try {
        session1 = SparkJobUtils.getCassandraSession(sparkJobConfig);
        writeOffset(session1, offsets);
    }catch (Exception e){
        logger.error("Exception processing kafka offsets", e);
    }finally {
        if(session1 != null){
            session1.close();
            session1.getCluster().close();
        }
    }
}

}
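For comparison only (this is not the job's actual code), here is a minimal sketch of the same "resume from stored offsets" pattern written against the spark-streaming-kafka-0-10 integration, which is the one matching the Spark 2.3.1 / Kafka 0.10 combination in the title. The topic name, broker address, and starting offset are placeholders:

    import java.util.*;
    import org.apache.kafka.clients.consumer.ConsumerRecord;
    import org.apache.kafka.common.TopicPartition;
    import org.apache.spark.streaming.api.java.*;
    import org.apache.spark.streaming.kafka010.*;

    // New-consumer style parameters (property names differ from the 0.8 direct API)
    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "broker:9092");        // placeholder broker address
    kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    kafkaParams.put("group.id", "kafka_es_push");
    kafkaParams.put("auto.offset.reset", "earliest");           // only applies when no explicit/committed offset is available
    kafkaParams.put("enable.auto.commit", false);

    // One entry per partition: the offset to start reading from (placeholder values)
    Map<TopicPartition, Long> fromOffsets = new HashMap<>();
    fromOffsets.put(new TopicPartition("push-to-es-topic", 0), 42L);

    // jssc is the JavaStreamingContext created as in the job above
    JavaInputDStream<ConsumerRecord<String, String>> stream =
        KafkaUtils.createDirectStream(
            jssc,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<String, String>Subscribe(
                Collections.singletonList("push-to-es-topic"), kafkaParams, fromOffsets));

    // Offset ranges can still be read per batch via HasOffsetRanges, as in the job above
    JavaDStream<String> values = stream.map(ConsumerRecord::value);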

Here is the method that builds my Kafka consumer properties:

public static Map<String, String> getConsumerProperties(String consumerBrokerUrl, String topicName, Boolean setSmallestOffset, SparkJobConfig sparkJobConfig) {

    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", consumerBrokerUrl);
    kafkaParams.put("group.id", "kafka_es_push");

    //kafkaParams.put("zookeeper.connect", sparkJobConfig.getElasticSearchConfig().getEsKafkaConsumerUrl());



    if(setSmallestOffset) {
        kafkaParams.put("auto.offset.reset", "smallest");
    }
    return kafkaParams;
}
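Note that "metadata.broker.list" and "auto.offset.reset=smallest" (as well as the commented-out "zookeeper.connect") are the old 0.8-style consumer property names. Purely for reference, and not part of the job, the same settings under the new-consumer property names expected by the kafka-0-10 integration; the broker address is a placeholder:

    Map<String, Object> newConsumerParams = new HashMap<>();
    newConsumerParams.put("bootstrap.servers", "broker:9092");   // replaces metadata.broker.list (placeholder address)
    newConsumerParams.put("group.id", "kafka_es_push");
    newConsumerParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    newConsumerParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    newConsumerParams.put("auto.offset.reset", "earliest");      // "smallest"/"largest" are the old-consumer spellings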
Ironically, all of this works perfectly when run locally. But as soon as I deploy it to my Spark Mesos cluster, it only consumes the previously produced messages.

0 Answers