Using multiple topics from multiple Kafka brokers in Flink

Time: 2019-09-10 13:37:52

Tags: scala apache-kafka apache-flink

I am using flink-1.4.2 with Scala, and I want to consume multiple data stream sources from Kafka. I have used the union function to merge them, but I am only able to get one Kafka source working. My code is below.

import java.util.Properties

import scala.collection.JavaConverters._

import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.IngestionTimeExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.fs.StringWriter
import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011

def main(args: Array[String]) {
    // Consumer properties for the first Kafka cluster
    val kProps = new Properties()
    kProps.setProperty("bootstrap.servers", "kafka01.prod.com:9092")
    kProps.setProperty("group.id", "test_cg")
    kProps.setProperty("enable.auto.commit", "true")
    kProps.setProperty("auto.offset.reset", "latest")
    // Consumer properties for the second Kafka cluster
    val kProps2 = new Properties()
    kProps2.setProperty("bootstrap.servers", "kafka04.prod.com:9092")
    kProps2.setProperty("group.id", "test_cg_2")
    kProps2.setProperty("enable.auto.commit", "true")
    kProps2.setProperty("auto.offset.reset", "latest")
    // Bucketing sink that writes the output to S3, rolled hourly
    val sink = new BucketingSink[SimpleKafkaOutputMsg]("s3://some-bucket/")
    sink.setBucketer(new DateTimeBucketer[SimpleKafkaOutputMsg]("yyyy-MM-dd-HH"))
    sink.setWriter(new StringWriter[SimpleKafkaOutputMsg])
    sink.setBatchSize(350 * 1024 * 1024) // 350 MB
    sink.setPendingPrefix("file-")
    sink.setPendingSuffix(".csv")

    val env = StreamExecutionEnvironment.getExecutionEnvironment

    env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)
    env.setParallelism(9)
    env.setStateBackend(new RocksDBStateBackend("file:///tmp/flink/checkpoints", false))
    // First source: topic ROUNDTRIP1 on the first cluster
    val topics = List("ROUNDTRIP1")
    val inpStream1 = env.addSource(new FlinkKafkaConsumer011(topics.asJava, new IndexedSectorMessagDes(), kProps))

    // Second source: topic ROUNDTRIP2 on the second cluster
    val topics2 = List("ROUNDTRIP2")
    val inpStream2 = env.addSource(new FlinkKafkaConsumer011(topics2.asJava, new IndexedSectorMessagDes(), kProps2))

    // Merge both sources into a single stream
    val inpStream = inpStream1.union(inpStream2)
      .filter(new InvalidFlightsFilterFunction())
      .map(attachUID(_))
      .assignTimestampsAndWatermarks(new IngestionTimeExtractor[IndexedSectorMessage]())
    val intStream = inpStream.flatMap { s => flattenFlights(s) }
    intStream.keyBy(getFlightKey _).process(new KeyedWindowTimeMedianFunction()).addSink(sink)
    env.execute("Scala WindowExample Example")
}
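
The two-consumer-plus-union pattern in the code above is a common way to read from two separate Kafka clusters in Flink 1.4.x: each FlinkKafkaConsumer011 is tied to exactly one cluster through its bootstrap.servers property, so the merge has to happen with union inside the job. Below is a minimal, self-contained sketch of that pattern, with the domain-specific deserializer and sink swapped for SimpleStringSchema and print; the broker addresses, topic names, and group ids are placeholders:

import java.util.Properties

import scala.collection.JavaConverters._

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
import org.apache.flink.streaming.util.serialization.SimpleStringSchema

object TwoClusterUnion {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Build consumer properties for one cluster; each consumer is bound to
    // exactly one bootstrap.servers list, so separate clusters need separate consumers.
    def consumerProps(brokers: String, groupId: String): Properties = {
      val p = new Properties()
      p.setProperty("bootstrap.servers", brokers)
      p.setProperty("group.id", groupId)
      p
    }

    // One consumer per cluster (placeholder brokers and topics)
    val streamA: DataStream[String] = env.addSource(new FlinkKafkaConsumer011[String](
      List("topicA").asJava, new SimpleStringSchema(), consumerProps("clusterA:9092", "cgA")))
    val streamB: DataStream[String] = env.addSource(new FlinkKafkaConsumer011[String](
      List("topicB").asJava, new SimpleStringSchema(), consumerProps("clusterB:9092", "cgB")))

    // union requires both streams to have the same element type
    streamA.union(streamB).print()

    env.execute("two-cluster union sketch")
  }
}

If only one source appears to emit data with a layout like this, two things worth checking are whether every consumer subtask gets a task slot under the configured parallelism, and whether auto.offset.reset = latest is hiding records for a group that has committed offsets but no new messages arriving.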

0 Answers:

No answers yet.