我通过 Spark Job Server 维护着一个长时间运行的 Spark 上下文(SparkContext)。批处理作业会定期触发,在某些情况下,作业会抛出异常而失败,堆栈跟踪如下。从堆栈跟踪中几乎看不出异常最初是在哪里触发的。
重新启动作业服务器并使用相同的输入运行时,作业运行正常。
从日志来看,代码在调用以下方法之前一直运行正常。
import org.joda.time.DateTime
import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.rdd.RDD
/** Join key holding a single start timestamp.
  *
  * Instances are built from the date list and parallelized into the RDD that is
  * joined against the Cassandra table via `joinWithCassandraTable`.
  *
  * NOTE(review): presumably the field name `start` has to match a key column of
  * the joined table — confirm against the table schema.
  *
  * @param start the timestamp used as the join key
  */
case class Key(start: DateTime)
// Method body: joins the key(s) built from `startDate` against the Cassandra table,
// restricts the rows to startDate < ts <= endDate, and logs the distinct, sorted
// values found in the `ids` set column.
val startDate = DateTime.parse("2016-07-04T00:00:00.000+00:00")
val endDate = DateTime.parse("2016-07-05T00:00:00.000+00:00")
val keyspace = "test_keyspace"
val table = "test_table"
val dates = List(startDate)
val keys = dates.map(date => Key(date))
// Keep only the second element of each joined pair; it is the value `getSet` is called on below.
val rdd = sc.parallelize(keys)
  .joinWithCassandraTable(keyspace, table)
  .where("ts > ?", startDate)
  .where("ts <= ?", endDate)
  .map(joined => joined._2)
// NOTE(review): reads from `rdd`, the only RDD defined in this snippet; `trackerRdd`
// is not defined anywhere visible here. Confirm that the joined RDD above is the
// intended source of the ids.
val ids = rdd
  .flatMap(row => row.getSet[String]("ids"))
  .distinct
  .sortBy(id => id)
  .collect()
  .toList
logger.info(s"$ids")
这是堆栈跟踪,其中 writeSerialData -> writeOrdinaryObject -> writeObject0 -> defaultWriteFields 这一组调用不断重复出现。
WARN s.j.JobManagerActor [] [] - Exception from job c269030a-615a-4218-97eb-328008e3c667:
java.util.concurrent.ExecutionException: Boxed Error
at scala.concurrent.impl.Promise$.resolver(Promise.scala:55) ~[scala-library-2.10.5.jar:na]
at scala.concurrent.impl.Promise$.scala$concurrent$impl$Promise$$resolveTry(Promise.scala:47) ~[scala-library-2.10.5.jar:na]
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:244) ~[scala-library-2.10.5.jar:na]
at scala.concurrent.Promise$class.complete(Promise.scala:55) ~[scala-library-2.10.5.jar:na]
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) ~[scala-library-2.10.5.jar:na]
at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:23) ~[scala-library-2.10.5.jar:na]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) [na:1.8.0_72]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) [na:1.8.0_72]
at java.lang.Thread.run(Thread.java:745) [na:1.8.0_72]
Caused by: java.lang.StackOverflowError: null
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1496) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509) ~[na:1.8.0_72]
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432) ~[na:1.8.0_72]
问题
`collect` 会导致递归调用吗?使用相同的输入重新运行作业可以正常工作。有什么调试思路吗?
该问题不易复现,要在运行几天之后才会出现。