I have a Spark Streaming job that runs with a 10-minute batch interval. The driver machine is an m4.4xlarge (64 GB) EC2 instance.
The job stalls after about 18 hours and crashes with the exception below. From reading other posts it seems the driver may have run out of memory. How can I check this? My PySpark configuration is below.
Also, how do I check memory usage in the Spark UI? I only see the 11 task nodes I have, not the driver.
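The only driver-side check I could come up with is pointing the standard JDK tools at the driver JVM on the master node (the <driver-pid> below is a placeholder for whatever jps reports for this job). Is that a reasonable way to confirm the driver heap is filling up?

# Run on the driver (master) node; <driver-pid> is hypothetical, taken from the jps output
jps -lm | grep SparkSubmit                    # find the driver JVM PID
jstat -gcutil <driver-pid> 5s                 # watch old-gen usage (O) and full GC count (FGC) over time
jmap -histo:live <driver-pid> | head -n 20    # top heap consumers at that moment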
export PYSPARK_SUBMIT_ARGS='--master yarn --deploy-mode client
--driver-memory 10g
--executor-memory 10g
--executor-cores 4
--conf spark.driver.cores=5
--packages "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2"
--conf spark.driver.maxResultSize=2g
--conf spark.shuffle.spill=true
--conf spark.yarn.driver.memoryOverhead=2048
--conf spark.yarn.executor.memoryOverhead=2048
--conf "spark.broadcast.blockSize=512M"
--conf "spark.memory.storageFraction=0.5"
--conf "spark.kryoserializer.buffer.max=1024"
--conf "spark.default.parallelism=600"
--conf "spark.sql.shuffle.partitions=600"
--driver-java-options -Dlog4j.configuration=file:///usr/lib/spark/conf/log4j.properties pyspark-shell'
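Would adding GC logging to the driver JVM also help confirm this? I am thinking of extending --driver-java-options with something like the following (the log path is just an example, not what I currently run), but I am not sure it is the right approach:

# Tentative change, not yet applied: append GC logging flags to the existing driver options
--driver-java-options "-Dlog4j.configuration=file:///usr/lib/spark/conf/log4j.properties -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -Xloggc:/tmp/driver-gc.log"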
[Stage 3507:> (0 + 0) / 600]Exception in thread "dag-scheduler-event-loop" java.lang.OutOfMemoryError: Java heap space
at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$4.apply(TorrentBroadcast.scala:231)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$4.apply(TorrentBroadcast.scala:231)
at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:205)
at net.jpountz.lz4.LZ4BlockOutputStream.finish(LZ4BlockOutputStream.java:235)
at net.jpountz.lz4.LZ4BlockOutputStream.close(LZ4BlockOutputStream.java:175)
at java.io.ObjectOutputStream$BlockDataOutputStream.close(ObjectOutputStream.java:1828)
at java.io.ObjectOutputStream.close(ObjectOutputStream.java:742)
at org.apache.spark.serializer.JavaSerializationStream.close(JavaSerializer.scala:57)
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$blockifyObject$1.apply$mcV$sp(TorrentBroadcast.scala:238)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1319)
at org.apache.spark.broadcast.TorrentBroadcast$.blockifyObject(TorrentBroadcast.scala:237)
at org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:107)
at org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:86)
at org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34)
at org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:56)
at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1387)
at org.apache.spark.scheduler.DAGScheduler.submitMissingTasks(DAGScheduler.scala:1012)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:933)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$org$apache$spark$scheduler$DAGScheduler$$submitStage$4.apply(DAGScheduler.scala:936)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$org$apache$spark$scheduler$DAGScheduler$$submitStage$4.apply(DAGScheduler.scala:935)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:935)
at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:873)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1630)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
[Stage 3507:> (0 + 0) / 600]18/02/23 12:59:33 ERROR TransportRequestHandler: Error sending result RpcResponse{requestId=8388437576763608177, body=NioManagedBuffer{buf=java.nio.HeapByteBuffer[pos=0 lim=81 cap=156]}} to /172.23.56.231:58822; closing connection