Spark Streaming & Kafka: Direct Approach - Some Confusion

时间:2015-09-25 08:41:38

标签: apache-spark spark-streaming

Recently I have been trying to integrate Spark Streaming and Kafka using the direct approach. In my application I need to periodically reload a List dataset from Elasticsearch, so I wrote a thread to do this. But when I run the application on my Spark cluster, the List is never updated.

However, when I integrate Spark Streaming and Kafka with the receiver-based approach and use the same thread, the List is updated.

Can someone explain this to me? BTW, is there an efficient way to periodically update the dataset? Thanks!
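
For reference, in the receiver-based variant I create the stream roughly like this (the ZooKeeper address and consumer group id here are just placeholders, not my real settings):

// receiver-based stream, shown only for comparison with the direct approach
Map<String, Integer> topicMap = new HashMap<String, Integer>();
for (String topic : topics.split(",")) {
    topicMap.put(topic, 1); // one consumer thread per topic
}
JavaPairReceiverInputDStream<String, String> receiverMessages = KafkaUtils
        .createStream(jssc, "zkhost:2181", "log-analysis-group", topicMap);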

Here is some of my code. The main method of the direct approach:

if (args.length < 2) {
    System.err.println("Usage: DirectKafkaWordCount <brokers> <topics>\n"
    + "  <brokers> is a list of one or more Kafka brokers\n"
    + "  <topics> is a list of one or more kafka topics to consume from\n\n");
    System.exit(1);
}
String brokers = args[0];
String topics = args[1];

// Create the Spark configuration
SparkConf conf = new SparkConf().setAppName("DirectReceiverLogAnalysis");
conf.set("es.index.auto.create", "true");

// Create the streaming context with a 2 second batch size
JavaStreamingContext jssc = new JavaStreamingContext(getContext(conf),new Duration(2000));

// initially load the rules
loadRules(jssc.sparkContext());

// periodically refresh the rules in a background thread
RuleThread ruleThread = new RuleThread(jssc.sparkContext());
ruleThread.start();

HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topics.split(",")));

HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", brokers);

// Create direct kafka stream with brokers and topics
JavaPairInputDStream<String, String> messages = KafkaUtils
        .createDirectStream(jssc, String.class, String.class,
                StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

// Get the lines
JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
    public String call(Tuple2<String, String> tuple2) {
        return tuple2._2();
    }
});

// get the windowed DStream (6 second window, sliding every 4 seconds)
JavaDStream<String> windowed_messageDStream = lines.window(
        new Duration(6000), new Duration(4000));

// run the SQL rules against each windowed batch
windowed_messageDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {

    public Void call(JavaRDD<String> rdd) throws Exception {
        if (rdd.count() > 0) {

            SQLContext sqlContext = JavaSQLContextSingleton
                    .getInstance(rdd.context());

            DataFrame json_rdd = sqlContext.jsonRDD(rdd);

            json_rdd.registerTempTable("logs");

            json_rdd.printSchema();

            System.out.println("---------------size-------------"
                    + sqlList.size());

            // match the data against each rule
            for (int i = 0; i < sqlList.size(); i++) {
                try {
                    DataFrame result = sqlContext.sql(sqlList.get(i));

                    if (result.count() > 0) {
                        JavaEsSparkSQL.saveToEs(result, "matching/logs");
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }

        return null;
    }
});

// Start the computation
jssc.start();
jssc.awaitTermination();

// stop the rule refresh thread
ruleThread.setStop(true);
}

// load the rules from Elasticsearch
public static void loadRules(JavaSparkContext context) {
    // read the rule documents from the sql_rules index
    JavaRDD<Map<String, Object>> esRDD = JavaEsSpark.esRDD(context,
            "sql_rules/logs").values();

    // collect the SQL rule strings into the list
    sqlList = esRDD.map(new Function<Map<String, Object>, String>() {

        public String call(Map<String, Object> map) throws Exception {
            return (String) map.get("sql");
        }
    }).collect();
}


The thread class:
public class RuleThread extends Thread {

    // volatile so that setStop() from the main thread is visible here
    public volatile boolean threadStop = false;

    public JavaSparkContext context;

    public RuleThread(JavaSparkContext context) {
        this.context = context;
    }

    public void setStop(boolean stop) {
        this.threadStop = stop;
    }

    @Override
    public void run() {
        try {
            while (!threadStop) {
                // reload the rules every minute
                Thread.sleep(1000 * 60 * 1);
                LogAnalysis.loadRules(context);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
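
BTW, one alternative I am considering for the periodic update (just a sketch, not tested): drop the separate thread and reload the rules at the top of each foreachRDD call, since the function passed to foreachRDD runs on the driver once per batch:

// hypothetical variant: refresh sqlList on the driver at the start of every batch
windowed_messageDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {
    public Void call(JavaRDD<String> rdd) throws Exception {
        // this outer function executes on the driver, so the driver-side sqlList gets updated
        loadRules(JavaSparkContext.fromSparkContext(rdd.context()));
        // ... then run the SQL rules against the batch as in the code above ...
        return null;
    }
});

Would that be a reasonable way to do it, or is there a better pattern?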

0 answers:

No answers yet.