Question

我有2个Kafka流，我想按某个键进行合并，并在合并的流之上执行状态操作，以便可以对两个流的计数求和

这是我尝试过的做法，但没有成功：


PCollection<String> stream1 = .. read from kafka

PCollection<String> stream2 = .. read from kafka

PCollection<KV<String,Long>> wordCount1 = stream1.apply(...)

PCollection<KV<String,Long>> wordCount2 = stream2.apply(...)

PCollection<KV<String,Long>> merged = merge wordCount1 and wordCount2 using CoGroupByKey

PCollection<KV<String,Long>> finalStream = merged.apply(...)

然后在 finalStream 上应用状态操作。

Answer 1

/**
 * Builds the source half of a word-count pipeline for a single Kafka topic.
 *
 * <p>The resulting collection contains one {@code (word, 1L)} pair for every
 * non-empty token found in the value of each Kafka record.
 *
 * <p>The anonymous {@code DoFn}s below are created inside an instance method and
 * therefore hold a reference to this object; that is presumably why the class
 * is {@link Serializable}.
 */
public class KafkaWordCount implements Serializable {
  /** Kafka bootstrap servers, e.g. {@code "localhost:9092"}. */
  private final String kafkaBrokers;

  /** Name of the topic to read from. */
  private final String topic;

  /**
   * @param brokers Kafka bootstrap servers to connect to
   * @param topic   topic whose record values are split into words
   */
  public KafkaWordCount(String brokers, String topic) {
    this.kafkaBrokers = brokers;
    this.topic = topic;
  }

  /**
   * Attaches the Kafka source and the word-extraction steps to {@code p}.
   *
   * @param p pipeline the transforms are applied to
   * @return an unwindowed collection of {@code (word, 1L)} pairs
   */
  public PCollection<KV<String, Long>> build(Pipeline p) {
    return p
        .apply(KafkaIO.<Long, String>read()
            .withBootstrapServers(kafkaBrokers)
            .withTopic(topic)
            .withKeyDeserializer(LongDeserializer.class)
            .withValueDeserializer(StringDeserializer.class))
        // Keep only the record value; key and Kafka metadata are not needed.
        .apply(ParDo.of(new DoFn<KafkaRecord<Long, String>, String>() {
          @ProcessElement
          public void processElement(ProcessContext processContext) {
            KafkaRecord<Long, String> record = processContext.element();
            processContext.output(record.getKV().getValue());
          }
        }))
        .apply("ExtractWords",
            ParDo.of(new DoFn<String, KV<String, Long>>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                // Split on runs of non-letter characters; a leading separator
                // yields an empty first token, which is skipped.
                for (String word : c.element().split("[^\\p{L}]+")) {
                  if (!word.isEmpty()) {
                    c.output(KV.of(word, 1L));
                  }
                }
              }
            }));
  }
}

/**
 * Streaming job that reads words from two Kafka topics, joins the per-word
 * counts of both topics with {@code CoGroupByKey}, keeps a running total per
 * word in Beam state and writes {@code "word:total"} strings to a third topic.
 */
public class DataPipe {
  /** Kafka bootstrap servers used for both the sources and the sink. */
  private static final String BOOTSTRAP_SERVERS = "localhost:9092";

  /** Id of the map state that holds the running total in the "finalState" step. */
  private static final String RUNNING_TOTAL_STATE_ID = "myState";

  /**
   * Builds and runs the pipeline on the Flink runner, blocking until it finishes.
   *
   * @param args ignored; options are created with defaults
   */
  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.create();
    options.as(FlinkPipelineOptions.class).setRunner(FlinkRunner.class);

    Pipeline p = Pipeline.create(options);

    PCollection<KV<String, Long>> stream1 = windowedWordCounts(p, "idm");
    PCollection<KV<String, Long>> stream2 = windowedWordCounts(p, "assist");

    final TupleTag<Long> web = new TupleTag<>();
    final TupleTag<Long> assist = new TupleTag<>();

    PCollection<KV<String, CoGbkResult>> joinedStream =
        KeyedPCollectionTuple.of(web, stream1)
            .and(assist, stream2)
            .apply(CoGroupByKey.<String>create());

    // Sum the counts that arrived for a word from both topics in this pane.
    // This step is intentionally stateless: the running total is maintained
    // once, in the "finalState" step below. Accumulating here as well would
    // count every occurrence twice.
    PCollection<KV<String, Long>> finalCountStream =
        joinedStream.apply(ParDo.of(
            new DoFn<KV<String, CoGbkResult>, KV<String, Long>>() {
              @ProcessElement
              public void processElement(ProcessContext processContext) {
                KV<String, CoGbkResult> element = processContext.element();
                CoGbkResult grouped = element.getValue();

                long sumAmount = 0L;
                for (Long count : grouped.getAll(web)) {
                  sumAmount += count;
                }
                for (Long count : grouped.getAll(assist)) {
                  sumAmount += count;
                }

                System.out.println(element.getKey() + "::" + sumAmount);
                processContext.output(KV.of(element.getKey(), sumAmount));
              }
            }));

    finalCountStream
        .apply("finalState", ParDo.of(new DoFn<KV<String, Long>, String>() {
          @StateId(RUNNING_TOTAL_STATE_ID)
          private final StateSpec<MapState<String, Long>> mapState =
              StateSpecs.map();

          @ProcessElement
          public void processElement(
              ProcessContext c,
              @StateId(RUNNING_TOTAL_STATE_ID) MapState<String, Long> state) {

            KV<String, Long> e = c.element();
            System.out.println("Thread ID :"
                + Thread.currentThread().getId());

            // Read the stored total once; null means this word has not been
            // seen yet.
            Long stored = state.get(e.getKey()).read();
            long newCount = (stored == null ? 0L : stored) + e.getValue();

            state.put(e.getKey(), newCount);
            c.output(e.getKey() + ":" + newCount);
          }
        }))
        .apply(KafkaIO.<Void, String>write()
            .withBootstrapServers(BOOTSTRAP_SERVERS)
            .withTopic("test")
            .withValueSerializer(StringSerializer.class)
            .values());

    p.run().waitUntilFinish();
  }

  /**
   * Reads {@code (word, 1L)} pairs from {@code topic} and places them in
   * one-hour fixed windows that fire a pane as soon as at least one element is
   * available, with zero allowed lateness and discarding fired panes.
   *
   * <p>Both inputs of the join go through this method so that they share the
   * same windowing configuration.
   *
   * @param p     pipeline the source is attached to
   * @param topic Kafka topic to read
   * @return the windowed word/count pairs for {@code topic}
   */
  private static PCollection<KV<String, Long>> windowedWordCounts(
      Pipeline p, String topic) {
    return new KafkaWordCount(BOOTSTRAP_SERVERS, topic)
        .build(p)
        .apply(
            Window
                .<KV<String, Long>>into(
                    FixedWindows.of(Duration.standardHours(1)))
                .triggering(
                    Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
                .withAllowedLateness(Duration.ZERO)
                .discardingFiredPanes());
  }
}

该 Beam 管道从两个 Kafka 流中读取文本，将其拆分为单词，按单词合并两个流，最后将两个流合计的单词计数发送到另一个 Kafka 主题。

Answer 2

供参考：如果我对问题的理解正确，可以使用 KafkaIO.withTopics(List<String>) 简化管道的第一部分，一步读取两个（或多个）主题。这样，之后就无需再合并来自不同主题的数据。

如何使用Apache Beam合并两个流并在合并的流上执行状态操作

2 个答案: