I have a Spark Structured Streaming job that reads data from multiple Kafka topics.
Now I want to aggregate the data over multiple window intervals
and save the results to a database.
Is this possible? Otherwise I would need a separate Spark job for the second level of aggregation.
Would I lose data if the dataset is processed a second time?
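
For context, this is roughly how the source dataset is built; the broker address, topic names, and JSON schema below are placeholders, not my real configuration:

private Dataset<Row> buildSourceDataset(SparkSession spark) {
    // Placeholder schema for the event payload.
    StructType schema = new StructType()
        .add("datacenter", DataTypes.StringType)
        .add("platform", DataTypes.StringType);
    return spark
        .readStream()
        .format("kafka")
        .option("kafka.bootstrap.servers", "broker1:9092")
        .option("subscribe", "topicA,topicB") // multiple topics, comma-separated
        .load()
        // Kafka delivers the payload as bytes; parse it and keep the event time.
        .select(
            from_json(col("value").cast("string"), schema).as("event"),
            col("timestamp")
        )
        .select(
            col("timestamp"),
            col("event.datacenter"),
            col("event.platform")
        );
}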
private void buildStream1(Dataset<Row> sourceDataset) {
    Dataset<Row> query = sourceDataset
        // Allow events to arrive up to 120 seconds late before the window state is finalized.
        .withWatermark("timestamp", "120 seconds")
        .select(
            col("timestamp"),
            col("datacenter"),
            col("platform")
        )
        .groupBy(
            // Sliding window: 120-second windows, advancing every 60 seconds.
            functions.window(col("timestamp"), "120 seconds", "60 seconds").as("timestamp"),
            col("datacenter"),
            col("platform")
        )
        .agg(
            count(lit(1)).as("count")
        );
    startKafkaStream(query);
}
private void buildStream2(Dataset<Row> sourceDataset) {
    Dataset<Row> query = sourceDataset
        .withWatermark("timestamp", "10 minutes")
        .select(
            col("timestamp"),
            col("datacenter"),
            col("platform")
        )
        .groupBy(
            // Coarser sliding window: 10-minute windows, advancing every 5 minutes.
            functions.window(col("timestamp"), "10 minutes", "5 minutes").as("timestamp"),
            col("datacenter"),
            col("platform")
        )
        .agg(
            count(lit(1)).as("count")
        );
    startKafkaStream(query);
}
private void startKafkaStream(Dataset<Row> aggregatedDataset) {
    aggregatedDataset
        .select(to_json(struct("*")).as("value"))
        .writeStream()
        .outputMode(OutputMode.Append())
        .option("truncate", false)
        // Console sink is only for debugging; the real job would write to the database.
        .format("console")
        .trigger(Trigger.ProcessingTime("10 minutes"))
        .start();
}
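
Both aggregations are driven from the same source dataset, roughly like the sketch below (the run method itself is illustrative, not my exact driver code):

void run(SparkSession spark) throws Exception {
    Dataset<Row> sourceDataset = buildSourceDataset(spark);

    // Each start() launches its own streaming query with independent window state.
    buildStream1(sourceDataset);
    buildStream2(sourceDataset);

    // Block until any of the running queries terminates.
    spark.streams().awaitAnyTermination();
}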