I am running a Spark application that crashes with the following exception. What is causing it?
19/08/14 18:30:20 ERROR org.apache.spark.executor.Executor- Exception in task 11.3 in stage 17.0 (TID 1651)
org.apache.spark.util.TaskCompletionListenerException: Exception 0: Cannot run program "chmod": error=11, Resource temporarily unavailable
Exception 1: Cannot run program "chmod": error=11, Resource temporarily unavailable
Previous exception in task: Error committing version 9 into HDFSStateStore[id=(op=0,part=11),dir=file:/usr/local/spark/checkpoint/Speed-Metrics/state/0/11]
org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider$HDFSBackedStateStore.commit(HDFSBackedStateStoreProvider.scala:138)
org.apache.spark.sql.execution.streaming.state.StreamingAggregationStateManagerBaseImpl.commit(StreamingAggregationStateManager.scala:89)
org.apache.spark.sql.execution.streaming.StateStoreSaveExec$$anonfun$doExecute$3$$anon$2$$anonfun$close$3.apply$mcV$sp(statefulOperators.scala:398)
org.apache.spark.sql.execution.streaming.StateStoreSaveExec$$anonfun$doExecute$3$$anon$2$$anonfun$close$3.apply(statefulOperators.scala:398)
org.apache.spark.sql.execution.streaming.StateStoreSaveExec$$anonfun$doExecute$3$$anon$2$$anonfun$close$3.apply(statefulOperators.scala:398)
org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:535)
org.apache.spark.sql.execution.streaming.StateStoreWriter$class.timeTakenMs(statefulOperators.scala:108)
org.apache.spark.sql.execution.streaming.StateStoreSaveExec.timeTakenMs(statefulOperators.scala:277)
org.apache.spark.sql.execution.streaming.StateStoreSaveExec$$anonfun$doExecute$3$$anon$2.close(statefulOperators.scala:398)
org.apache.spark.util.NextIterator.closeIfNeeded(NextIterator.scala:66)
org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:75)
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.sort_addToSorter_0$(Unknown Source)
org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
org.apache.spark.sql.execution.aggregate.SortAggregateExec$$anonfun$doExecute$1$$anonfun$3.apply(SortAggregateExec.scala:80)
org.apache.spark.sql.execution.aggregate.SortAggregateExec$$anonfun$doExecute$1$$anonfun$3.apply(SortAggregateExec.scala:77)
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndexInternal$1$$anonfun$12.apply(RDD.scala:823)
org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndexInternal$1$$anonfun$12.apply(RDD.scala:823)
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
org.apache.spark.scheduler.Task.run(Task.scala:121)
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
java.lang.Thread.run(Thread.java:748)
at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:138)
at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:116)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
19/08/14 18:30:20 ERROR org.apache.spark.util.SparkUncaughtExceptionHandler- Uncaught exception in thread Thread[CoarseGrainedExecutorBackend-stop-executor,5,main]
java.lang.OutOfMemoryError: unable to create new native thread
at java.lang.Thread.start0(Native Method)
at java.lang.Thread.start(Thread.java:717)
at io.netty.util.concurrent.ThreadPerTaskExecutor.execute(ThreadPerTaskExecutor.java:33)
at io.netty.util.concurrent.SingleThreadEventExecutor.doStartThread(SingleThreadEventExecutor.java:847)
at io.netty.util.concurrent.SingleThreadEventExecutor.shutdownGracefully(SingleThreadEventExecutor.java:585)
at io.netty.util.concurrent.MultithreadEventExecutorGroup.shutdownGracefully(MultithreadEventExecutorGroup.java:163)
at io.netty.util.concurrent.AbstractEventExecutorGroup.shutdownGracefully(AbstractEventExecutorGroup.java:70)
at org.apache.spark.network.client.TransportClientFactory.close(TransportClientFactory.java:289)
at org.apache.spark.network.netty.NettyBlockTransferService.close(NettyBlockTransferService.scala:183)
at org.apache.spark.storage.BlockManager.stop(BlockManager.scala:1615)
at org.apache.spark.SparkEnv.stop(SparkEnv.scala:90)
at org.apache.spark.executor.Executor.stop(Executor.scala:264)
at org.apache.spark.executor.CoarseGrainedExecutorBackend$$anonfun$receive$1$$anon$1.run(CoarseGrainedExecutorBackend.scala:123)
Spark version: 2.4.3. It is a Spark standalone cluster with one master and one worker. Both the master and the worker have ample memory, and the memory-related variables are set correctly in the spark-env.sh file.
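As a sanity check that the values from spark-env.sh actually reach the application, here is a minimal sketch (the object name and the chosen keys are just examples, not part of my app) that prints the memory settings as the running session resolves them:

import org.apache.spark.sql.SparkSession

object ConfCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("conf-check").getOrCreate()
    // Print the values the session actually resolved for a few relevant keys.
    Seq("spark.executor.memory", "spark.driver.memory", "spark.executor.cores")
      .foreach(k => println(s"$k = ${spark.conf.getOption(k).getOrElse("<not set>")}"))
    spark.stop()
  }
}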
I checked ulimit and it is unlimited. The maximum thread count is also high.
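Since the limits look fine from the shell, here is a small diagnostic sketch (names are mine) for reading thread counts from inside the JVM itself, which is the process that actually hits the native-thread limit:

import java.lang.management.ManagementFactory

object ThreadHeadroom {
  def main(args: Array[String]): Unit = {
    val mx = ManagementFactory.getThreadMXBean
    // Peak vs. live tells you whether the thread count ever spiked toward
    // the per-user process limit (ulimit -u for the user running the worker).
    println(s"live threads:  ${mx.getThreadCount}")
    println(s"peak threads:  ${mx.getPeakThreadCount}")
    println(s"total started: ${mx.getTotalStartedThreadCount}")
  }
}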
I noticed that many threads go into a sleeping state, and this leads to the "unable to create new native thread" exception.
In a thread dump I see a lot of threads waiting: java.lang.Thread.State: WAITING
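To quantify those waiting threads, here is a hypothetical helper (not part of my app) that groups all live threads by state; running it periodically would show whether the WAITING pile is stable or growing, i.e. whether there is a thread leak:

import scala.collection.JavaConverters._

object ThreadStates {
  def main(args: Array[String]): Unit = {
    // Group every live thread in this JVM by its state and print the counts.
    val byState = Thread.getAllStackTraces.keySet.asScala
      .groupBy(_.getState)
      .mapValues(_.size)
    byState.toSeq.sortBy(-_._2).foreach { case (state, n) =>
      println(f"$state%-15s$n%5d")
    }
  }
}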
Memory is fine and the process ulimit is fine. I am stuck on how to debug this further.
What could be the problem, and how can I fix it?