每1分钟从Kinesis Stream汇总记录

时间:2017-08-05 21:08:38

标签: amazon-dynamodb apache-flink amazon-kinesis flink-streaming

我正在尝试编写一个Flink程序来处理Kinesis Stream。 Kinesis流来自AWS DynamoDB流,表示在DynamoDB表中进行的插入。

Stream中的每条记录都可以包含多个插入记录。插入记录的数量可以是可变的(可以在1到10之间变化)

我想按1分钟的时间窗口对流中的所有插入记录进行分组,并对展示次数(impressionCount)字段求和。

[
    {
        "country":"NL",
        "userOS":"mac",
        "createdOn":"2017-08-02 16:22:17.135600",
        "trafficType":"D",
        "affiliateId":"87",
        "placement":"4",
        "offerId":"999",
        "advertiserId":"139",
        "impressionCount":"1",
        "uniqueOfferCount":"0"
    },
    {
        "country":"NL",
        "userOS":"mac",
        "createdOn":"2017-08-02 16:22:17.135600",
        "trafficType":"D",
        "affiliateId":"85",
        "placement":"4",
        "offerId":"688",
        "advertiserId":"139",
        "impressionCount":"1",
        "uniqueOfferCount":"0"
    }
]

我的代码:

// Build the Kinesis consumer for "Impressions-Stream" first, then register it as a source;
// every stream element is the List that RawImpressionLogSchema produces.
FlinkKinesisConsumer<List> impressionConsumer = new FlinkKinesisConsumer<>(
        "Impressions-Stream", new RawImpressionLogSchema(), consumerConfig);
DataStream<List> kinesisStream = env.addSource(impressionConsumer);

/**
 * Deserialization schema that turns the raw bytes of a stream record into a list of
 * {@link RawImpressionLogRecord}s by delegating to
 * {@code RawImpressionLogRecord.parseImpressionLog(byte[])}.
 */
public class RawImpressionLogSchema implements DeserializationSchema<List> {

    /** Reports the produced type as type information extracted for the raw {@code List} class. */
    @Override
    public TypeInformation<List> getProducedType() {
        final TypeInformation<List> listTypeInfo = TypeExtractor.getForClass(List.class);
        return listTypeInfo;
    }

    /** Always answers {@code false}: no element is treated as an end-of-stream marker. */
    @Override
    public boolean isEndOfStream(List ignoredEvent) {
        return false;
    }

    /** Parses one record payload into its impression-log entries. */
    @Override
    public List<RawImpressionLogRecord> deserialize(byte[] payload) {
        final List<RawImpressionLogRecord> parsedRecords =
                RawImpressionLogRecord.parseImpressionLog(payload);
        return parsedRecords;
    }

}

/**
 * Parses the JSON payload of one stream record into impression-log records.
 *
 * <p>The bytes are decoded explicitly as UTF-8 so the result does not depend on the
 * platform default charset of the machine running the job.
 *
 * @param impressionLogBytes raw JSON bytes of the record
 * @return the records Gson deserializes from the payload as an
 *         {@code ArrayList<RawImpressionLogRecord>}
 * @throws IllegalArgumentException if parsing yields no JSON element
 */
public static List<RawImpressionLogRecord> parseImpressionLog(
        byte[] impressionLogBytes) {

    // Pin the charset: the no-charset InputStreamReader constructor uses the platform default.
    JsonReader jsonReader = new JsonReader(new InputStreamReader(
            new ByteArrayInputStream(impressionLogBytes),
            java.nio.charset.StandardCharsets.UTF_8));

    JsonElement jsonElement = Streams.parse(jsonReader);

    if (jsonElement == null) {
        // Decode the payload with the same charset used for parsing so the message is faithful.
        throw new IllegalArgumentException(
                "Event does not define a eventName field: "
                        + new String(impressionLogBytes,
                                java.nio.charset.StandardCharsets.UTF_8));
    }

    // TypeToken captures the generic element type that would otherwise be erased.
    Type listType = new TypeToken<ArrayList<RawImpressionLogRecord>>(){}.getType();
    return gson.fromJson(jsonElement, listType);

}

我能够解析输入并创建kinesisStream。想知道这是正确的方法吗?以及如何实现聚合。

得到DataStream之后,我该如何在这个List流上应用map / filter / group by?

我是Flink的新手,如能得到任何帮助,将不胜感激。

更新

尝试使用以下代码来解决上述用例。但不知何故,reduce函数没有被调用。知道下面的代码有什么问题吗?

// Run the job on event time, i.e. on the timestamp extracted from each record below.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

DataStream<List<ImpressionLogRecord>> rawRecords = env.addSource(new ImpressionLogDataSourceFunction("C:\\LogFiles\\input.txt"));

// Turn the stream of List<ImpressionLogRecord> into a stream of single records, then assign
// each record its createdOn time as timestamp, with a 5-second out-of-orderness bound.
DataStream<ImpressionLogRecord> impressionLogDataStream = rawRecords
        .flatMap(new Splitter())
        .assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<ImpressionLogRecord>(Time.seconds(5)) {

                    @Override
                    public long extractTimestamp(
                            ImpressionLogRecord element) {
                        // systemDefault() is declared on ZoneId; calling it through ZoneOffset
                        // wrongly suggests a fixed offset is returned.
                        // NOTE(review): createdOn is interpreted in the JVM's default zone —
                        // confirm that matches the zone the timestamps were written in.
                        return element.getCreatedOn()
                                .atZone(java.time.ZoneId.systemDefault())
                                .toInstant()
                                .toEpochMilli();
                    }
                }
        );

//impressionLogDataStream.print();

// Key each record by a composite string: a prefix of createdOn plus every grouping
// dimension, joined with "_".
KeyedStream<ImpressionLogRecord, String> keyedImpressionLogDataStream = impressionLogDataStream

            .keyBy(impressionLogRecordForKey -> {
                // StringBuilder instead of StringBuffer: the buffer never leaves this lambda
                // call, so StringBuffer's synchronization is pure overhead.
                StringBuilder groupByKey = new StringBuilder();
                // First 16 characters of createdOn's string form — presumably
                // "yyyy-MM-ddTHH:mm" (minute precision); confirm the type of createdOn.
                groupByKey.append(impressionLogRecordForKey.getCreatedOn().toString().substring(0, 16))
                        .append("_")
                        .append(impressionLogRecordForKey.getOfferId())
                        .append("_")
                        .append(impressionLogRecordForKey.getAdvertiserId())
                        .append("_")
                        .append(impressionLogRecordForKey.getAffiliateId())
                        .append("_")
                        .append(impressionLogRecordForKey.getCountry())
                        .append("_")
                        .append(impressionLogRecordForKey.getPlacement())
                        .append("_")
                        .append(impressionLogRecordForKey.getTrafficType())
                        .append("_")
                        .append(impressionLogRecordForKey.getUserOS());
                System.out.println("Call to Group By Function===================" + groupByKey);
                return groupByKey.toString();
            });

//keyedImpressionLogDataStream.print();

// Aggregate each key over 5-minute time windows: two records of the same key are folded into
// a new record that copies the dimensions of the first and sums both counters.
// NOTE(review): presumably the reducer is only invoked once a window holds at least two
// records for a key — verify against Flink's ReduceFunction documentation.
// NOTE(review): no env.execute() is visible in this snippet — confirm the job is launched.
DataStream<ImpressionLogRecord> aggImpressionRecord = keyedImpressionLogDataStream
        .timeWindow(Time.minutes(5))
        .reduce((accumulated, incoming) -> {
            System.out.println("Calling Reduce Function-------------------------");

            ImpressionLogRecord merged = new ImpressionLogRecord();

            // Dimensions come from the first operand; createdOn is cut down to the minute.
            merged.setOfferId(accumulated.getOfferId());
            merged.setCreatedOn(accumulated.getCreatedOn().truncatedTo(ChronoUnit.MINUTES));
            merged.setAdvertiserId(accumulated.getAdvertiserId());
            merged.setAffiliateId(accumulated.getAffiliateId());
            merged.setCountry(accumulated.getCountry());
            merged.setPlacement(accumulated.getPlacement());
            merged.setTrafficType(accumulated.getTrafficType());
            merged.setUserOS(accumulated.getUserOS());

            // Counters combine both operands.
            // NOTE(review): assumes the count getters return numeric types; with String
            // getters "+" would concatenate — confirm.
            merged.setImpressionCount(
                    accumulated.getImpressionCount() + incoming.getImpressionCount());
            merged.setUniqueOfferCount(
                    accumulated.getUniqueOfferCount() + incoming.getUniqueOfferCount());

            return merged;
        });

aggImpressionRecord.print();

1 个答案:

答案 0 :(得分:1)

可正常运行的代码:

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Run the job on event time, i.e. on the timestamp extracted from each record below.
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

    DataStream<List<ImpressionLogRecord>> rawRecords = env.addSource(new ImpressionLogDataSourceFunction("C:\\LogFiles\\input.txt"));

    // Convert the DataStream of List<ImpressionLogRecord> into a single stream of
    // ImpressionLogRecord, then assign each record its createdOn time as timestamp,
    // with a 5-second out-of-orderness bound.
    DataStream<ImpressionLogRecord> impressionLogDataStream = rawRecords
            .flatMap(new RecordSplitter())
            .assignTimestampsAndWatermarks(
                    new BoundedOutOfOrdernessTimestampExtractor<ImpressionLogRecord>(Time.seconds(5)) {
                        @Override
                        public long extractTimestamp(
                                ImpressionLogRecord element) {
                            // systemDefault() is declared on ZoneId; calling it through
                            // ZoneOffset wrongly suggests a fixed offset is returned.
                            // NOTE(review): createdOn is interpreted in the JVM's default
                            // zone — confirm that matches how the timestamps were written.
                            return element.getCreatedOn()
                                    .atZone(java.time.ZoneId.systemDefault())
                                    .toInstant()
                                    .toEpochMilli();
                        }
                    }
            );

    // Group the records in the stream by a user-defined composite key: a prefix of
    // createdOn plus every grouping dimension, joined with "_".
    KeyedStream<ImpressionLogRecord, String> keyedImpressionLogDataStream = impressionLogDataStream

                .keyBy(impressionLogRecordForKey -> {
                    // StringBuilder instead of StringBuffer: the buffer never leaves this
                    // lambda call, so StringBuffer's synchronization is pure overhead.
                    StringBuilder groupByKey = new StringBuilder();
                    // First 16 characters of createdOn's string form — presumably
                    // "yyyy-MM-ddTHH:mm" (minute precision); confirm the type of createdOn.
                    groupByKey.append(impressionLogRecordForKey.getCreatedOn().toString().substring(0, 16))
                            .append("_")
                            .append(impressionLogRecordForKey.getOfferId())
                            .append("_")
                            .append(impressionLogRecordForKey.getAdvertiserId())
                            .append("_")
                            .append(impressionLogRecordForKey.getAffiliateId())
                            .append("_")
                            .append(impressionLogRecordForKey.getCountry())
                            .append("_")
                            .append(impressionLogRecordForKey.getPlacement())
                            .append("_")
                            .append(impressionLogRecordForKey.getTrafficType())
                            .append("_")
                            .append(impressionLogRecordForKey.getUserOS());
                    return groupByKey.toString();
                });

    // Aggregate the keyed records over 1-minute time windows, summing the impression count
    // and the unique offer count of the records that share a key.
    DataStream<ImpressionLogRecord> aggImpressionRecord = keyedImpressionLogDataStream
            .timeWindow(Time.minutes(1))
            .reduce((accumulated, incoming) -> {
                ImpressionLogRecord merged = new ImpressionLogRecord();

                // Dimensions come from the first operand; createdOn is cut down to the minute.
                merged.setOfferId(accumulated.getOfferId());
                merged.setCreatedOn(accumulated.getCreatedOn().truncatedTo(ChronoUnit.MINUTES));
                merged.setAdvertiserId(accumulated.getAdvertiserId());
                merged.setAffiliateId(accumulated.getAffiliateId());
                merged.setCountry(accumulated.getCountry());
                merged.setPlacement(accumulated.getPlacement());
                merged.setTrafficType(accumulated.getTrafficType());
                merged.setUserOS(accumulated.getUserOS());

                // Counters combine both operands.
                // NOTE(review): assumes the count getters return numeric types; with String
                // getters "+" would concatenate — confirm.
                merged.setImpressionCount(
                        accumulated.getImpressionCount() + incoming.getImpressionCount());
                merged.setUniqueOfferCount(
                        accumulated.getUniqueOfferCount() + incoming.getUniqueOfferCount());

                return merged;
            });

    // Print the aggregated records and also hand them to ImpressionLogDataSink.
    aggImpressionRecord.print();
    aggImpressionRecord.addSink(new ImpressionLogDataSink());

    // Launch the pipeline defined above.
    env.execute();

}

public static class RecordSplitter
        implements
            FlatMapFunction<List<ImpressionLogRecord>, ImpressionLogRecord> {
    @Override
    public void flatMap(List<ImpressionLogRecord> rawImpressionRecords,
            Collector<ImpressionLogRecord> impressionLogRecordCollector)
            throws Exception {
        for (int i = 0; i < rawImpressionRecords.size(); i++) {
            impressionLogRecordCollector.collect(rawImpressionRecords.get(i));
        }

    }
}`enter code here`