I am trying to write a Flink program to process a Kinesis stream. The Kinesis stream comes from an AWS DynamoDB stream and represents inserts made to a DynamoDB table.
Each record in the stream can contain multiple insert records; the number of insert records is variable (it can range from 1 to 10).
I want to group all insert records in the stream over 1-minute intervals and sum the impressionCount field. A sample payload:
[
{
"country":"NL",
"userOS":"mac",
"createdOn":"2017-08-02 16:22:17.135600",
"trafficType":"D",
"affiliateId":"87",
"placement":"4",
"offerId":"999",
"advertiserId":"139",
"impressionCount":"1",
"uniqueOfferCount":"0"
},
{
"country":"NL",
"userOS":"mac",
"createdOn":"2017-08-02 16:22:17.135600",
"trafficType":"D",
"affiliateId":"85",
"placement":"4",
"offerId":"688",
"advertiserId":"139",
"impressionCount":"1",
"uniqueOfferCount":"0"
}
]
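For context, here is a minimal sketch of what the record POJO behind this JSON might look like. The actual RawImpressionLogRecord / ImpressionLogRecord classes are not shown in the question, so every name and type below is an assumption based on the JSON keys:

// Hypothetical record POJO matching the JSON keys above (types are assumptions;
// Gson fills the fields by name when deserializing the array).
public class RawImpressionLogRecord {
    private String country;
    private String userOS;
    private String createdOn;      // e.g. "2017-08-02 16:22:17.135600"
    private String trafficType;
    private String affiliateId;
    private String placement;
    private String offerId;
    private String advertiserId;
    private int impressionCount;
    private int uniqueOfferCount;

    // The remaining accessors follow the same bean pattern, e.g.:
    public int getImpressionCount() {
        return impressionCount;
    }

    public void setImpressionCount(int impressionCount) {
        this.impressionCount = impressionCount;
    }
}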
My code:
DataStream<List> kinesisStream = env.addSource(new FlinkKinesisConsumer<>(
        "Impressions-Stream", new RawImpressionLogSchema(), consumerConfig));

/** CLASS: RawImpressionLogSchema **/
public class RawImpressionLogSchema implements DeserializationSchema<List> {

    @Override
    public List<RawImpressionLogRecord> deserialize(byte[] bytes) {
        return RawImpressionLogRecord.parseImpressionLog(bytes);
    }

    @Override
    public boolean isEndOfStream(List event) {
        return false;
    }

    @Override
    public TypeInformation<List> getProducedType() {
        return TypeExtractor.getForClass(List.class);
    }
}
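As an aside, one way to keep the element type instead of working with a raw List is to parameterize the schema and build the TypeInformation from a TypeHint. A minimal sketch, reusing the same parseImpressionLog method (note that the package of DeserializationSchema differs across Flink versions; older releases have it in org.apache.flink.streaming.util.serialization):

import java.util.List;

import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;

// Typed variant of the schema: produces List<RawImpressionLogRecord> instead of a raw List.
public class TypedImpressionLogSchema
        implements DeserializationSchema<List<RawImpressionLogRecord>> {

    @Override
    public List<RawImpressionLogRecord> deserialize(byte[] bytes) {
        return RawImpressionLogRecord.parseImpressionLog(bytes);
    }

    @Override
    public boolean isEndOfStream(List<RawImpressionLogRecord> event) {
        return false;
    }

    @Override
    public TypeInformation<List<RawImpressionLogRecord>> getProducedType() {
        // TypeHint preserves the generic element type that a raw List.class would lose.
        return TypeInformation.of(new TypeHint<List<RawImpressionLogRecord>>() {});
    }
}

With that, the source can be declared as DataStream<List<RawImpressionLogRecord>>, which makes the later flatMap easier to type.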
/** parse Method **/
public static List<RawImpressionLogRecord> parseImpressionLog(
        byte[] impressionLogBytes) {

    JsonReader jsonReader = new JsonReader(new InputStreamReader(
            new ByteArrayInputStream(impressionLogBytes)));
    JsonElement jsonElement = Streams.parse(jsonReader);

    if (jsonElement == null) {
        throw new IllegalArgumentException(
                "Event does not define a eventName field: "
                        + new String(impressionLogBytes));
    } else {
        Type listType = new TypeToken<ArrayList<RawImpressionLogRecord>>(){}.getType();
        return gson.fromJson(jsonElement, listType);
    }
}
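The same parsing can also be written more compactly if the payload is known to be UTF-8 JSON. A sketch, assuming the same static gson instance and record class as above:

// Compact alternative to the parse step above (a sketch; assumes UTF-8 JSON payloads).
public static List<RawImpressionLogRecord> parseImpressionLogCompact(byte[] impressionLogBytes) {
    String json = new String(impressionLogBytes, StandardCharsets.UTF_8);
    Type listType = new TypeToken<ArrayList<RawImpressionLogRecord>>() {}.getType();
    List<RawImpressionLogRecord> records = gson.fromJson(json, listType);
    if (records == null) {
        throw new IllegalArgumentException("Could not parse impression log: " + json);
    }
    return records;
}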
I am able to parse the input and create the kinesisStream. Is this the right approach, and how do I implement the aggregation?
Once I have the DataStream, how can I apply map / filter / group by on a stream of Lists?
I am new to Flink, so any help would be greatly appreciated.
I tried the following code to solve the above use case, but somehow the reduce function is never called. Any idea what is wrong with the code below?
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

DataStream<List<ImpressionLogRecord>> rawRecords = env.addSource(new ImpressionLogDataSourceFunction("C:\\LogFiles\\input.txt"));

DataStream<ImpressionLogRecord> impressionLogDataStream = rawRecords
        .flatMap(new Splitter())
        .assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<ImpressionLogRecord>(Time.seconds(5)) {
                    @Override
                    public long extractTimestamp(ImpressionLogRecord element) {
                        return element.getCreatedOn().atZone(ZoneOffset.systemDefault()).toInstant().toEpochMilli();
                    }
                });
//impressionLogDataStream.print();

KeyedStream<ImpressionLogRecord, String> keyedImpressionLogDataStream = impressionLogDataStream
        .keyBy(impressionLogRecordForKey -> {
            StringBuffer groupByKey = new StringBuffer();
            groupByKey.append(impressionLogRecordForKey.getCreatedOn().toString().substring(0, 16));
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getOfferId());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getAdvertiserId());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getAffiliateId());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getCountry());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getPlacement());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getTrafficType());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getUserOS());
            System.out.println("Call to Group By Function===================" + groupByKey);
            return groupByKey.toString();
        });
//keyedImpressionLogDataStream.print();

DataStream<ImpressionLogRecord> aggImpressionRecord = keyedImpressionLogDataStream
        .timeWindow(Time.minutes(5))
        .reduce((prevLogRecord, currentLogRecord) -> {
            System.out.println("Calling Reduce Function-------------------------");
            ImpressionLogRecord aggregatedImpressionLog = new ImpressionLogRecord();
            aggregatedImpressionLog.setOfferId(prevLogRecord.getOfferId());
            aggregatedImpressionLog.setCreatedOn(prevLogRecord.getCreatedOn().truncatedTo(ChronoUnit.MINUTES));
            aggregatedImpressionLog.setAdvertiserId(prevLogRecord.getAdvertiserId());
            aggregatedImpressionLog.setAffiliateId(prevLogRecord.getAffiliateId());
            aggregatedImpressionLog.setCountry(prevLogRecord.getCountry());
            aggregatedImpressionLog.setPlacement(prevLogRecord.getPlacement());
            aggregatedImpressionLog.setTrafficType(prevLogRecord.getTrafficType());
            aggregatedImpressionLog.setUserOS(prevLogRecord.getUserOS());
            aggregatedImpressionLog.setImpressionCount(prevLogRecord.getImpressionCount() + currentLogRecord.getImpressionCount());
            aggregatedImpressionLog.setUniqueOfferCount(prevLogRecord.getUniqueOfferCount() + currentLogRecord.getUniqueOfferCount());
            return aggregatedImpressionLog;
        });

aggImpressionRecord.print();
Answer (score: 1)
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

DataStream<List<ImpressionLogRecord>> rawRecords = env.addSource(new ImpressionLogDataSourceFunction("C:\\LogFiles\\input.txt"));

// Converts the DataStream of List<ImpressionLogRecord> into a single stream of ImpressionLogRecord
// and assigns a timestamp to each record in the stream.
DataStream<ImpressionLogRecord> impressionLogDataStream = rawRecords
        .flatMap(new RecordSplitter())
        .assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<ImpressionLogRecord>(Time.seconds(5)) {
                    @Override
                    public long extractTimestamp(ImpressionLogRecord element) {
                        return element.getCreatedOn().atZone(ZoneOffset.systemDefault()).toInstant().toEpochMilli();
                    }
                });

// Groups the records in the stream by a user-defined key.
KeyedStream<ImpressionLogRecord, String> keyedImpressionLogDataStream = impressionLogDataStream
        .keyBy(impressionLogRecordForKey -> {
            StringBuffer groupByKey = new StringBuffer();
            groupByKey.append(impressionLogRecordForKey.getCreatedOn().toString().substring(0, 16));
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getOfferId());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getAdvertiserId());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getAffiliateId());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getCountry());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getPlacement());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getTrafficType());
            groupByKey.append("_");
            groupByKey.append(impressionLogRecordForKey.getUserOS());
            return groupByKey.toString();
        });

// Aggregates the grouped records every 1 minute and sums the impression count and unique offer count.
DataStream<ImpressionLogRecord> aggImpressionRecord = keyedImpressionLogDataStream
        .timeWindow(Time.minutes(1))
        .reduce((prevLogRecord, currentLogRecord) -> {
            ImpressionLogRecord aggregatedImpressionLog = new ImpressionLogRecord();
            aggregatedImpressionLog.setOfferId(prevLogRecord.getOfferId());
            aggregatedImpressionLog.setCreatedOn(prevLogRecord.getCreatedOn().truncatedTo(ChronoUnit.MINUTES));
            aggregatedImpressionLog.setAdvertiserId(prevLogRecord.getAdvertiserId());
            aggregatedImpressionLog.setAffiliateId(prevLogRecord.getAffiliateId());
            aggregatedImpressionLog.setCountry(prevLogRecord.getCountry());
            aggregatedImpressionLog.setPlacement(prevLogRecord.getPlacement());
            aggregatedImpressionLog.setTrafficType(prevLogRecord.getTrafficType());
            aggregatedImpressionLog.setUserOS(prevLogRecord.getUserOS());
            aggregatedImpressionLog.setImpressionCount(prevLogRecord.getImpressionCount() + currentLogRecord.getImpressionCount());
            aggregatedImpressionLog.setUniqueOfferCount(prevLogRecord.getUniqueOfferCount() + currentLogRecord.getUniqueOfferCount());
            return aggregatedImpressionLog;
        });

aggImpressionRecord.print();
aggImpressionRecord.addSink(new ImpressionLogDataSink());

env.execute();
}
public static class RecordSplitter
        implements FlatMapFunction<List<ImpressionLogRecord>, ImpressionLogRecord> {

    @Override
    public void flatMap(List<ImpressionLogRecord> rawImpressionRecords,
            Collector<ImpressionLogRecord> impressionLogRecordCollector) throws Exception {
        for (int i = 0; i < rawImpressionRecords.size(); i++) {
            impressionLogRecordCollector.collect(rawImpressionRecords.get(i));
        }
    }
}
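The ImpressionLogDataSourceFunction and ImpressionLogDataSink used above are not shown. For local testing, the source could be as simple as a SourceFunction that reads the file line by line and emits each line as a List<ImpressionLogRecord>. The sketch below is hypothetical: it assumes each line of input.txt is a JSON array like the sample payload and that ImpressionLogRecord has a parse helper analogous to RawImpressionLogRecord.parseImpressionLog from the question.

import java.io.BufferedReader;
import java.io.FileReader;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.apache.flink.streaming.api.functions.source.SourceFunction;

// Hypothetical sketch of the unshown ImpressionLogDataSourceFunction: reads one JSON
// array per line from a local file and emits it as a List<ImpressionLogRecord>.
public class ImpressionLogDataSourceFunction implements SourceFunction<List<ImpressionLogRecord>> {

    private final String filePath;
    private volatile boolean running = true;

    public ImpressionLogDataSourceFunction(String filePath) {
        this.filePath = filePath;
    }

    @Override
    public void run(SourceContext<List<ImpressionLogRecord>> ctx) throws Exception {
        try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) {
            String line;
            while (running && (line = reader.readLine()) != null) {
                // Assumed parse helper; mirrors RawImpressionLogRecord.parseImpressionLog.
                ctx.collect(ImpressionLogRecord.parseImpressionLog(line.getBytes(StandardCharsets.UTF_8)));
            }
        }
    }

    @Override
    public void cancel() {
        running = false;
    }
}

Once run() returns, Flink emits a final watermark for the finished source, so even a bounded file input should eventually fire the pending event-time windows. The sink can be any SinkFunction<ImpressionLogRecord>, e.g. one that writes the aggregated records to a database or a file.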