我有2个Kafka流,我想按某个键进行合并,并在合并的流之上执行状态操作,以便可以对两个流的计数求和
这是我尝试过的做法,但一直没能让它正常工作:
PCollection<String> stream1 = .. read from kafka
PCollection<String> stream2 = .. read from kafka
PCollection<KV<String,Long>> wordCount1 = stream1.apply(...)
PCollection<KV<String,Long>> wordCount2 = stream2.apply(...)
PCollection<KV<String,Long>> merged = merge wordCount1 and wordCount2 using CoGroupByKey
PCollection<KV<String,Long>> finalStream = merged.apply(...)
然后在 finalStream 上应用状态(stateful)操作。
答案 0 :(得分:1)
/**
 * Builds the read-and-tokenize front half of a word-count pipeline for one Kafka topic.
 *
 * <p>Records are read with {@code Long} keys and {@code String} values; only the value is
 * used. Each value is split on runs of non-letter characters and every non-empty token is
 * emitted as {@code KV(word, 1L)}.
 *
 * <p>The class is {@link Serializable} because the anonymous {@code DoFn}s created in
 * {@link #build(Pipeline)} are inner classes of an instance method and therefore hold a
 * reference to the enclosing instance.
 */
public class KafkaWordCount implements Serializable {

    /** Kafka bootstrap servers, passed to {@code withBootstrapServers}. */
    private final String kafkaBrokers;

    /** Name of the topic to read from. */
    private final String topic;

    /**
     * @param brokers Kafka bootstrap servers string (e.g. {@code "localhost:9092"})
     * @param topic   topic to consume
     */
    public KafkaWordCount(String brokers, String topic) {
        this.kafkaBrokers = brokers;
        this.topic = topic;
    }

    /**
     * Attaches the Kafka source and the word-extraction steps to the given pipeline.
     *
     * @param p pipeline to attach the transforms to
     * @return a collection with one {@code KV(word, 1L)} per word occurrence
     */
    public PCollection<KV<String, Long>> build(Pipeline p) {
        return p
            .apply(KafkaIO.<Long, String>read()
                .withBootstrapServers(this.kafkaBrokers)
                .withTopic(this.topic)
                .withKeyDeserializer(LongDeserializer.class)
                .withValueDeserializer(StringDeserializer.class))
            // Drop the Kafka metadata and key; keep only the message text.
            .apply(ParDo.of(new DoFn<KafkaRecord<Long, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext processContext) {
                    KafkaRecord<Long, String> record = processContext.element();
                    processContext.output(record.getKV().getValue());
                }
            }))
            .apply("ExtractWords",
                ParDo.of(new DoFn<String, KV<String, Long>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                        // Split on anything that is not a Unicode letter; a leading
                        // separator yields an empty first token, hence the isEmpty check.
                        for (String word : c.element().split("[^\\p{L}]+")) {
                            if (!word.isEmpty()) {
                                c.output(KV.of(word, 1L));
                            }
                        }
                    }
                }));
    }
}
/**
 * Entry point: reads words from two Kafka topics, joins the per-word counts of both topics
 * with {@code CoGroupByKey}, keeps a running total per word in Beam state, and writes
 * {@code "word:total"} strings to a third Kafka topic.
 */
public class DataPipe {

    /** State id of the running per-word total kept by the "finalState" step. */
    private static final String RUNNING_TOTAL_STATE_ID = "myState";

    /**
     * Applies the windowing shared by both input streams: one-hour fixed windows that fire
     * a pane for every arriving element, with no allowed lateness and discarding panes.
     * Because panes are discarded after firing, each downstream element carries only the
     * counts that arrived since the previous firing.
     */
    private static PCollection<KV<String, Long>> windowHourly(
            PCollection<KV<String, Long>> words) {
        return words.apply(
            Window
                .<KV<String, Long>>into(FixedWindows.of(Duration.standardHours(1)))
                .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
                .withAllowedLateness(Duration.ZERO)
                .discardingFiredPanes());
    }

    /**
     * Builds and runs the pipeline on the Flink runner, blocking until it finishes.
     *
     * @param args ignored; options are created with defaults
     */
    public static void main(String[] args) {
        PipelineOptions options = PipelineOptionsFactory.create();
        options.as(FlinkPipelineOptions.class).setRunner(FlinkRunner.class);
        Pipeline p = Pipeline.create(options);

        PCollection<KV<String, Long>> stream1 =
            windowHourly(new KafkaWordCount("localhost:9092", "idm").build(p));
        PCollection<KV<String, Long>> stream2 =
            windowHourly(new KafkaWordCount("localhost:9092", "assist").build(p));

        final TupleTag<Long> web = new TupleTag<Long>();
        final TupleTag<Long> assist = new TupleTag<Long>();

        PCollection<KV<String, CoGbkResult>> joinedStream =
            KeyedPCollectionTuple.of(web, stream1)
                .and(assist, stream2)
                .apply(CoGroupByKey.<String>create());

        // Collapse each join result into a single per-word increment. This step is
        // deliberately stateless: the running total is accumulated once, in "finalState"
        // below. (The previous version also kept a MapState here but never used the
        // value it read from it.)
        PCollection<KV<String, Long>> finalCountStream =
            joinedStream.apply(ParDo.of(
                new DoFn<KV<String, CoGbkResult>, KV<String, Long>>() {
                    @ProcessElement
                    public void processElement(ProcessContext processContext) {
                        KV<String, CoGbkResult> element = processContext.element();
                        long sumAmount = 0L;
                        for (Long count : element.getValue().getAll(web)) {
                            sumAmount += count;
                        }
                        for (Long count : element.getValue().getAll(assist)) {
                            sumAmount += count;
                        }
                        // Debug trace of every increment.
                        System.out.println(element.getKey() + "::" + sumAmount);
                        processContext.output(KV.of(element.getKey(), sumAmount));
                    }
                }));

        finalCountStream
            .apply("finalState", ParDo.of(new DoFn<KV<String, Long>, String>() {
                @StateId(RUNNING_TOTAL_STATE_ID)
                private final StateSpec<MapState<String, Long>> mapState =
                    StateSpecs.map();

                @ProcessElement
                public void processElement(
                        ProcessContext c,
                        @StateId(RUNNING_TOTAL_STATE_ID) MapState<String, Long> state) {
                    KV<String, Long> e = c.element();
                    // Debug trace showing which thread handles the element.
                    System.out.println("Thread ID :"
                        + Thread.currentThread().getId());
                    // Read the stored total once; null means this word has not been
                    // seen yet.
                    Long stored = state.get(e.getKey()).read();
                    long newCount = (stored == null ? 0L : stored) + e.getValue();
                    state.put(e.getKey(), newCount);
                    c.output(e.getKey() + ":" + newCount);
                }
            }))
            .apply(KafkaIO.<Void, String>write()
                .withBootstrapServers("localhost:9092")
                .withTopic("test")
                .withValueSerializer(StringSerializer.class)
                .values());

        p.run().waitUntilFinish();
    }
}
该 Beam 管道从两个 Kafka 流中读取文本,将其拆分为单词,并按单词合并两个流,最后把两个流合并后的单词计数发送到另一个 Kafka 主题。
答案 1 :(得分:0)
供参考:如果我对问题的理解正确,则可以使用 KafkaIO.withTopics(List<String>)
简化管道的第一部分,在一步中同时读取两个(或多个)主题。这样,之后就无需再对来自不同主题的数据进行合并(join)。