I am consuming data from Kafka with Spark Structured Streaming and trying to write it to 3 different sinks. I want the streams to execute in order, because the logic in stream2 (query2), inside its writer, depends on stream1 (query1). What actually happens is that query2 executes before query1 and my logic breaks.
import org.apache.spark.sql.functions.{max, min}
import org.apache.spark.sql.streaming.Trigger

val inputDf = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", brokers)
.option("assign"," {\""+topic+"\":[0]}")
.load()
val df1 = inputDf.selectExpr("CAST(partition AS INT)", "CAST(offset AS INT)", "CAST(timestamp AS STRING)")
val query1 = df1.agg(min("offset"), max("offset"))
.writeStream
.foreach(writer)
.outputMode("complete")
.trigger(Trigger.ProcessingTime("2 minutes"))
.option("checkpointLocation", checkpoint_loc1).start()
// result = (derived from some processing over the 'inputDf' dataframe)
val query2 = result.select(result("eventdate")).distinct
distDates.writeStream.foreach(writer1)
.trigger(Trigger.ProcessingTime("2 minutes"))
.option("checkpointLocation", checkpoint_loc2).start()
val query3 = result.writeStream
.outputMode("append")
.format("orc")
.partitionBy("eventdate")
.option("path", "/warehouse/test_duplicate/download/data1")
.option("checkpointLocation", checkpoint_loc)
.option("maxRecordsPerFile", 999999999)
.trigger(Trigger.ProcessingTime("2 minutes"))
.start()
spark.streams.awaitAnyTermination()
result.checkpoint()
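One idea I am considering (just a rough sketch, I have not verified it) is to drop query1 and query2 as separate streams and run both dependent pieces of logic inside a single foreachBatch, so the offset bookkeeping always executes before the logic that depends on it. In the sketch below, deriveResult, recordOffsets and processEventDates are hypothetical placeholders for the processing that builds 'result' and for whatever writer and writer1 currently do:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{max, min}
import org.apache.spark.sql.streaming.Trigger

// Sketch only: a single query, so the two dependent writes happen in a fixed
// order within every micro-batch. deriveResult, recordOffsets and
// processEventDates are hypothetical stand-ins, not code I already have.
val combined = inputDf.writeStream
  .trigger(Trigger.ProcessingTime("2 minutes"))
  .option("checkpointLocation", checkpoint_loc1) // reusing query1's location, since query1 goes away in this variant
  .foreachBatch { (batchDf: DataFrame, batchId: Long) =>
    batchDf.persist()

    // Step 1: offset bookkeeping (what query1/writer does today) runs first.
    val offsets = batchDf
      .selectExpr("CAST(partition AS INT)", "CAST(offset AS INT)")
      .agg(min("offset"), max("offset"))
    recordOffsets(offsets, batchId)

    // Step 2: only afterwards, the logic that depends on it (query2/writer1).
    val batchResult = deriveResult(batchDf) // stands in for building 'result'
    processEventDates(batchResult.select("eventdate").distinct(), batchId)

    batchDf.unpersist()
    ()
  }
  .start()

Is consolidating the sinks like this the recommended way to enforce the ordering, or is there a way to make separate streaming queries run one after the other?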