KafkaProducer as a broadcast variable in Kafka-Spark Integration

Date: 2016-10-01 19:09:44

Tags: apache-spark apache-kafka spark-streaming kafka-producer-api apache-spark-standalone

I am trying to read from Kafka and push the data to another Kafka queue using Spark.

My initial approach was to create a KafkaProducer object for every record in each RDD partition. It works, but performance-wise it is really bad.
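
An intermediate improvement over creating a producer per record is to create one per partition. A rough sketch of that variant, assuming the same imports as the full program below (the broker address and topic name are placeholders):

    rdd.foreachPartition(itr -> {
        Properties partProps = new Properties();
        partProps.setProperty("bootstrap.servers", "broker1:9092"); // placeholder
        partProps.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        partProps.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        // One producer per partition: created once, reused for every record in this partition.
        Producer<String, String> producer = new KafkaProducer<>(partProps);
        try {
            while (itr.hasNext()) {
                producer.send(new ProducerRecord<>("mytopic", itr.next()));
            }
        } finally {
            producer.close(); // flush pending sends and release the connection
        }
    });

That is already much cheaper than one producer per record, but it still creates and closes a producer for every partition of every batch.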

So I tried the broadcast variable concept and passed the KafkaProducer to the executors as a broadcast variable. That attempt ends with:

    Exception in thread "main" com.esotericsoftware.kryo.KryoException: java.util.ConcurrentModificationException

Please explain or correct my code so that the KafkaProducer is used in Spark the right way and with better performance.

    import java.io.Serializable;
    import java.util.HashMap;
    import java.util.Map;
    import java.util.Properties;
    import java.util.Set;
    import java.util.TreeSet;
    import java.util.concurrent.Future;

    import org.apache.commons.configuration.ConfigurationConverter;
    import org.apache.commons.configuration.ConfigurationException;
    import org.apache.commons.configuration.PropertiesConfiguration;
    import org.apache.kafka.clients.producer.KafkaProducer;
    import org.apache.kafka.clients.producer.Producer;
    import org.apache.kafka.clients.producer.ProducerRecord;
    import org.apache.kafka.clients.producer.RecordMetadata;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.broadcast.Broadcast;
    import org.apache.spark.streaming.Duration;
    import org.apache.spark.streaming.api.java.JavaInputDStream;
    import org.apache.spark.streaming.api.java.JavaStreamingContext;
    import org.apache.spark.streaming.kafka.KafkaUtils;

    import kafka.common.TopicAndPartition;
    import kafka.message.MessageAndMetadata;
    import kafka.serializer.StringDecoder;

    public class MyService implements Serializable {


        private static final long serialVersionUID = 1L;
        private PropertiesConfiguration props;
        private Producer<String, String> producer = null;
        private Future<RecordMetadata> receipt = null;
        private RecordMetadata receiptInfo = null;

        public void setProperties() {

            try {
                props = new PropertiesConfiguration("/conf/empty.properties");
            } catch (ConfigurationException e) {
                // TODO Auto-generated catch block
                System.out.println("Line 51");
                e.printStackTrace();
            }

            if (!props.containsKey("producer.topic")) {
                props.setProperty("producer.topic", "mytopic");
            }

            Properties producerprops = ConfigurationConverter.getProperties(props);

            producerprops.setProperty("bootstrap.servers", props.getString("target.bootstrap.servers"));
            producerprops.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
            producerprops.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); // ????

            this.producer = new KafkaProducer<String, String>(producerprops);

        }

        public void sendmessage(String Value) {

            try {
                System.out.println("Line 111");

                String key = "xyz";

                if ("20".equals(Value)) { // compare string contents; "==" on Strings compares references
                    receipt = producer
                            .send(new ProducerRecord<String, String>(props.getString("producer.topic"), key, Value));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }

        }


        public static void main(String[] args) {
            String topicNames = "mysourcetopic";
            Set<String> topicSet = new TreeSet<String>();
            for (String topic : topicNames.split(",")) {
                topicSet.add(topic.trim());
            }

            Map<TopicAndPartition, Long> topicMap = new HashMap<TopicAndPartition, Long>();
            for (String topic : topicNames.split(",")) {
                for (int i = 0; i < 2; i++) {

                    TopicAndPartition tp = new TopicAndPartition(topic, i);
                    topicMap.put(tp, 0L);
                }
            }

            JavaSparkContext sparkConf = new JavaSparkContext("**************", "Kafka-Spark");

            MyService ec = new MyService();
            ec.setProperties();

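            // NOTE: broadcasting the live KafkaProducer instance on the next line is what fails
            // with the KryoException above, since the producer holds non-serializable state
            // (network connections, buffers, a background I/O thread).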
            final Broadcast<Producer> bCastProducer = sparkConf.broadcast(ec.producer);

            sparkConf.getConf().set("spark.local.ip", "abcddd");

            sparkConf.getConf().set("spark.eventLog.enabled", "false");
            sparkConf.getConf().set("spark.shuffle.blockTransferService", "nio");

            JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, new Duration(10000));

            Map<String, String> kafkaParams = new HashMap<String, String>();
            String pollInterval = "10000";
            String zookeeper = "xyzzz";
            int partition = 1;
            kafkaParams.put("metadata.broker.list", "xyzzz");
            kafkaParams.put("group.id", "Consumer");
            kafkaParams.put("client.id", "Consumer");
            kafkaParams.put("zookeeper.connect", zookeeper);
            JavaInputDStream<String> dfs = KafkaUtils.createDirectStream(jsc, String.class, String.class,
                    StringDecoder.class, StringDecoder.class, String.class, kafkaParams, topicMap,
                    (Function<MessageAndMetadata<String, String>, String>) MessageAndMetadata::message);

            dfs.foreachRDD(rdd -> {
                if (rdd.isEmpty()) {
                    return;
                }

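                // Note: this lambda uses ec directly (and never bCastProducer), so the task closure
                // also has to serialize MyService together with its producer field.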
                rdd.foreachPartition(itr -> {
                    try {
                        // System.out.println("231");

                        while (itr.hasNext()) {
                            ec.sendmessage(itr.next()); // Produce

                        }

                    } catch (Exception e) {
                    }
                });
            });
            jsc.start();
            jsc.awaitTermination();
        }

    }

1 Answer:

Answer 0 (score: 0)

Please see my answer to essentially the same question in an earlier Stack Overflow discussion (How to write to Kafka from Spark Streaming).
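
For reference, the approach usually suggested for this situation is to broadcast a small serializable wrapper that creates the KafkaProducer lazily on each executor, instead of broadcasting the producer itself. A minimal sketch of that pattern; the class name KafkaSink and the field names are illustrative, and the serializers match the ones used in the question:

    import java.io.Serializable;
    import java.util.Properties;

    import org.apache.kafka.clients.producer.KafkaProducer;
    import org.apache.kafka.clients.producer.Producer;
    import org.apache.kafka.clients.producer.ProducerRecord;

    public class KafkaSink implements Serializable {

        private static final long serialVersionUID = 1L;

        private final Properties producerProps;               // lightweight, serializable configuration
        private transient Producer<String, String> producer;  // created lazily on the executor, never serialized

        public KafkaSink(Properties producerProps) {
            this.producerProps = producerProps;
        }

        public void send(String topic, String key, String value) {
            if (producer == null) {
                // First use in this executor JVM: create the producer once and reuse it
                // for every record and every batch processed here.
                producer = new KafkaProducer<>(producerProps);
            }
            producer.send(new ProducerRecord<>(topic, key, value));
        }
    }

On the driver side only the wrapper (effectively just its Properties) is broadcast, and the per-partition loop sends through the broadcast value:

    // producerProps here stands for the Properties built in setProperties(); it would need
    // to be returned or exposed rather than fed straight into a KafkaProducer there.
    final Broadcast<KafkaSink> sink = sparkConf.broadcast(new KafkaSink(producerProps));

    dfs.foreachRDD(rdd -> {
        rdd.foreachPartition(itr -> {
            while (itr.hasNext()) {
                sink.value().send("mytopic", "xyz", itr.next());
            }
        });
    });

This avoids both problems above: nothing non-serializable is ever broadcast or captured in a task closure, and each executor keeps a single long-lived producer instead of building one per record or per partition. In a real job you would also want to close that producer eventually, for example from a JVM shutdown hook.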