Structured Streaming: joining more than 2 streams

Date: 2019-05-22 08:07:48

Tags: java join stream

java.lang.NullPointerException
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.eval(Unknown Source)
    at org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinExec$OneSideHashJoiner$$anonfun$26.apply(StreamingSymmetricHashJoinExec.scala:412)
    at org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinExec$OneSideHashJoiner$$anonfun$26.apply(StreamingSymmetricHashJoinExec.scala:412)
    at org.apache.spark.sql.execution.streaming.state.SymmetricHashJoinStateManager$$anon$2.findNextValueForIndex(SymmetricHashJoinStateManager.scala:197)
    at org.apache.spark.sql.execution.streaming.state.SymmetricHashJoinStateManager$$anon$2.getNext(SymmetricHashJoinStateManager.scala:221)
    at org.apache.spark.sql.execution.streaming.state.SymmetricHashJoinStateManager$$anon$2.getNext(SymmetricHashJoinStateManager.scala:157)
    at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
    at org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinExec$$anonfun$org$apache$spark$sql$execution$streaming$StreamingSymmetricHashJoinExec$$onOutputCompletion$1$1.apply$mcV$sp(StreamingSymmetricHashJoinExec.scala:338)
    at org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinExec$$anonfun$org$apache$spark$sql$execution$streaming$StreamingSymmetricHashJoinExec$$onOutputCompletion$1$1.apply(StreamingSymmetricHashJoinExec.scala:323)
    at org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinExec$$anonfun$org$apache$spark$sql$execution$streaming$StreamingSymmetricHashJoinExec$$onOutputCompletion$1$1.apply(StreamingSymmetricHashJoinExec.scala:323)
    at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:535)
    at org.apache.spark.sql.execution.streaming.StateStoreWriter$class.timeTakenMs(statefulOperators.scala:108)
    at org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinExec.timeTakenMs(StreamingSymmetricHashJoinExec.scala:126)
    at org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinExec.org$apache$spark$sql$execution$streaming$StreamingSymmetricHashJoinExec$$onOutputCompletion$1(StreamingSymmetricHashJoinExec.scala:323)
    at org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinExec$$anonfun$org$apache$spark$sql$execution$streaming$StreamingSymmetricHashJoinExec$$processPartitions$1.apply$mcV$sp(StreamingSymmetricHashJoinExec.scala:361)
    at org.apache.spark.util.CompletionIterator$$anon$1.completion(CompletionIterator.scala:47)
    at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:36)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
    at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:216)
    at org.apache.spark.sql.execution.SortExec$$anonfun$1.apply(SortExec.scala:108)
    at org.apache.spark.sql.execution.SortExec$$anonfun$1.apply(SortExec.scala:101)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

I am trying to left-outer join four streaming tables with a batch interval of 5 minutes. After about an hour of processing, the program throws a NullPointerException. The sink format is Delta, written from Structured Streaming.

    mt_rovk_df = createFullDf(mt_rovk_df, MT_RVOK)
      .select(col("date"), col("USERID"), col("ID"), col("RECVMTTIME"), col("PTMSGID"), col("SPGATEBIND"), col("ORDERCPNO"), col("timestamp"))
      .withWatermark("timestamp", "2 minute")
      .dropDuplicates("PTMSGID", "timestamp")
    rpt_rovk_df = createFullDf(rpt_rovk_df, RPT_RVOK)
      .select(col("PTMSGID") as "RPT_ROVK_PTMSGID", col("RECVTIME"), col("ERRORCODE"), col("timestamp") as "RPT_ROVK_TIMESTAMP")
      .withWatermark("RPT_ROVK_TIMESTAMP", "2 minute")
      .dropDuplicates("RPT_ROVK_PTMSGID", "RPT_ROVK_TIMESTAMP")
    rpt_sdok_df = createFullDf(rpt_sdok_df, RPT_SDOK)
      .select(col("PTMSGID") as "RPT_SDOK_PTMSGID", col("SENDFLAG"), col("timestamp") as "RPT_SDOK_TIMESTAMP")
      .withWatermark("RPT_SDOK_TIMESTAMP", "2 minute")
      .dropDuplicates("RPT_SDOK_PTMSGID", "RPT_SDOK_TIMESTAMP")
    mt_sdok_df = createFullDf(mt_sdok_df, MT_SDOK)
      .select(col("PTMSGID") as "MT_SDOK_PTMSGID", col("SENDTIME"), col("SPGATESEND"), col("SENDRESULT"), col("timestamp") as "MT_SDOK_TIMESTAMP")
      .withWatermark("MT_SDOK_TIMESTAMP", "2 minute")
      .dropDuplicates("MT_SDOK_PTMSGID", "MT_SDOK_TIMESTAMP")

    val batchDF1 = joinDf.coalesce(coalesceNum.toInt * 30).writeStream
      .trigger(Trigger.ProcessingTime(batchInterval.toInt * 5, TimeUnit.SECONDS))
      .queryName("SparkSave1")
      .partitionBy("date")
      .outputMode(OutputMode.Append)
      .option("checkpointLocation", "hdfs://ns1/checkpoint2")
      .format("delta")
      .option("path", "hdfs://ns1/delta/")
      .start()

    batchDF1.awaitTermination()
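
The snippet references `joinDf` without defining it. For context, here is a minimal sketch of how the four-way stream-stream left outer join could be wired up; the `expr` conditions and the `interval 1 hour` bound are illustrative assumptions, not the original code (Spark requires watermarks on both sides plus an event-time constraint in the join condition for stream-stream outer joins):

    import org.apache.spark.sql.functions.expr

    // Sketch only: join keys come from the selects above, but the time-range
    // bounds are assumptions. The event-time constraint is what lets Spark
    // decide when buffered join state can be expired.
    val joinDf = mt_rovk_df
      .join(rpt_rovk_df,
        expr("PTMSGID = RPT_ROVK_PTMSGID AND " +
             "RPT_ROVK_TIMESTAMP >= `timestamp` AND " +
             "RPT_ROVK_TIMESTAMP <= `timestamp` + interval 1 hour"),
        "leftOuter")
      .join(rpt_sdok_df,
        expr("PTMSGID = RPT_SDOK_PTMSGID AND " +
             "RPT_SDOK_TIMESTAMP >= `timestamp` AND " +
             "RPT_SDOK_TIMESTAMP <= `timestamp` + interval 1 hour"),
        "leftOuter")
      .join(mt_sdok_df,
        expr("PTMSGID = MT_SDOK_PTMSGID AND " +
             "MT_SDOK_TIMESTAMP >= `timestamp` AND " +
             "MT_SDOK_TIMESTAMP <= `timestamp` + interval 1 hour"),
        "leftOuter")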

How can I fix this?

0 Answers:

No answers