import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import scala.concurrent.duration._

val spark = SparkSession.builder().appName("app2").getOrCreate()

// Read the raw records as a streaming DataFrame from the Kafka topic passed as args(0)
val df = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "host:port")
.option("subscribe", args(0))
.option("startingOffsets", "earliest")
.load()
val checkPoint = "outputKafka/checkpoint/"

// Parse the comma-separated payload: field 5 holds the crime's primary type,
// field 2 its creation date
val newDf = df.selectExpr("CAST(value AS STRING)")
val df2 = newDf
  .withColumn("Primary_Type", split(col("value"), ",")(5))
  .withColumn("Create_Date", split(col("value"), ",")(2))

// Convert the Create_Date string into a proper timestamp column
val pattern = "MM/dd/yyyy hh:mm:ss a"
val finalDf = df2.withColumn("DateNew",
  unix_timestamp(df2("Create_Date"), pattern).cast("timestamp"))
// Count crimes per Primary_Type over 2-minute event-time windows and
// append each finalized window to CSV files every 20 seconds
finalDf.withWatermark("DateNew", "500 milliseconds")
  .groupBy(window(finalDf("DateNew"), "2 minutes"), col("Primary_Type"))
  .agg(count("*").as("Crime_Count"))
  .select("window.start", "window.end", "Primary_Type", "Crime_Count")
  .writeStream.format("csv")
  .option("path", args(1) + "kafka.csv")
  .option("checkpointLocation", checkPoint)
  .option("header", "true")
  .outputMode("append")
  .trigger(Trigger.ProcessingTime(20.seconds))
  .start()
  .awaitTermination()
How do I get rid of the following error?
Exception in thread "main" org.apache.spark.sql.AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;;
    at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.org$apache$spark$sql$catalyst$analysis$UnsupportedOperationChecker$$throwError(UnsupportedOperationChecker.scala:374)
    at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.checkForStreaming(UnsupportedOperationChecker.scala:110)
    at org.apache.spark.sql.streaming.StreamingQueryManager.createQuery(StreamingQueryManager.scala:235)
    at org.apache.spark.sql.streaming.StreamingQueryManager.startQuery(StreamingQueryManager.scala:299)
    at org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:296)
    at com.mindtree.chicago.Question2$.main(Question2.scala:74)
    at com.mindtree.chicago.Question2.main(Question2.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:894)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
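The query does call withWatermark, yet the checker still reports a streaming aggregation without one. A plausible cause is the column reference: finalDf("DateNew") inside window() resolves against the DataFrame from before withWatermark was applied, so the grouping column may not carry the watermark metadata the analyzer looks for. Below is a minimal sketch of the aggregation rewritten to reference the column through col(...) instead; treating this as the fix for this exact Spark version is an assumption on my part, not something verified here.

// Sketch: resolve the event-time column with col(...) so it is taken from the
// watermarked plan (assumption: finalDf("DateNew") bypasses the watermark metadata)
val aggregated = finalDf
  .withWatermark("DateNew", "500 milliseconds")
  .groupBy(window(col("DateNew"), "2 minutes"), col("Primary_Type"))
  .agg(count("*").as("Crime_Count"))
  .select("window.start", "window.end", "Primary_Type", "Crime_Count")

As a side note, a 500 millisecond watermark on 2-minute windows finalizes each window almost immediately after it closes, so any record arriving later than that is dropped; a delay sized to the expected event lateness is the more common choice.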