In my use case I need to maintain historical data per key. For this I use updateStateByKey, with the state kept as a mutable Scala collection (ArrayBuffer). Each element of the ArrayBuffer is one incoming record. The Spark version is 1.6.
As the number of elements (records) in a key's ArrayBuffer grows, I get a StackOverflowError:
18/03/28 07:31:55 ERROR scheduler.JobScheduler: Error running job streaming job 1459150304000 ms.2
java.lang.StackOverflowError
at scala.collection.immutable.StringOps.stripSuffix(StringOps.scala:31)
at org.apache.spark.Logging$class.logName(Logging.scala:44)
at org.apache.spark.rdd.RDD.logName(RDD.scala:74)
at org.apache.spark.Logging$class.log(Logging.scala:51)
at org.apache.spark.rdd.RDD.log(RDD.scala:74)
at org.apache.spark.Logging$class.logDebug(Logging.scala:62)
at org.apache.spark.rdd.RDD.logDebug(RDD.scala:74)
at org.apache.spark.rdd.CoGroupedRDD$$anonfun$getDependencies$1.apply(CoGroupedRDD.scala:104)
at org.apache.spark.rdd.CoGroupedRDD$$anonfun$getDependencies$1.apply(CoGroupedRDD.scala:99)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.AbstractTraversable.map(Traversable.scala:105)
at org.apache.spark.rdd.CoGroupedRDD.getDependencies(CoGroupedRDD.scala:99)
at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:226)
at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:224)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.dependencies(RDD.scala:224)
at org.apache.spark.rdd.CoGroupedRDD$$anonfun$getPartitions$1$$anonfun$apply$mcVI$sp$1.apply(CoGroupedRDD.scala:117)
at org.apache.spark.rdd.CoGroupedRDD$$anonfun$getPartitions$1$$anonfun$apply$mcVI$sp$1.apply(CoGroupedRDD.scala:115)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.AbstractTraversable.map(Traversable.scala:105)
at org.apache.spark.rdd.CoGroupedRDD$$anonfun$getPartitions$1.apply$mcVI$sp(CoGroupedRDD.scala:115)
at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:141)
at org.apache.spark.rdd.CoGroupedRDD.getPartitions(CoGroupedRDD.scala:113)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.CoGroupedRDD$$anonfun$getPartitions$1$$anonfun$apply$mcVI$sp$1.apply(CoGroupedRDD.scala:121)
at org.apache.spark.rdd.CoGroupedRDD$$anonfun$getPartitions$1$$anonfun$apply$mcVI$sp$1.apply(CoGroupedRDD.scala:115)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
Here is the code snippet:
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.sql.Row

// Merge the previous state for a key with the records that arrived in this batch.
// Each element of `rows` is an ArrayBuffer[Row]; its first Row is appended to the state.
def updateState(rows: Seq[ArrayBuffer[Row]], state: Option[ArrayBuffer[Row]]): Option[ArrayBuffer[Row]] = {
  val prevState = state.getOrElse(ArrayBuffer.empty[Row])
  val newState = ArrayBuffer.empty[Row]
  newState ++= prevState
  for (r <- rows) {
    newState += r(0)
  }
  Some(newState)
}

val pairedFaultStream = getPairedStream(faultStream, sqlContext)
val workingStream = pairedFaultStream.updateStateByKey[ArrayBuffer[Row]](updateState _).map(_._2)
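For reference, the driver around this snippet is set up roughly as sketched below. This is a simplified outline rather than my exact code: the app name, batch interval, and checkpoint path are placeholders, and faultStream / getPairedStream are the same inputs referenced in the snippet above. A checkpoint directory is configured because updateStateByKey requires one.

import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setAppName("FaultStateJob")   // placeholder app name
val ssc = new StreamingContext(conf, Seconds(10))        // placeholder batch interval
ssc.checkpoint("/tmp/fault-checkpoint")                  // required when using updateStateByKey
val sqlContext = SQLContext.getOrCreate(ssc.sparkContext)

// faultStream is the input DStream of fault records; getPairedStream is our helper that
// keys it and wraps each record in an ArrayBuffer[Row], producing pairedFaultStream above.

workingStream.foreachRDD(rdd => println(rdd.count()))    // force evaluation of each batch
ssc.start()
ssc.awaitTermination()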
I have tried the following approaches.
Any suggestions would be appreciated.