我有一个pyspark应用程序,当通过jupyter笔记本调用时,它运行得很好,但是当使用spark-submit
从命令行运行时,我得到了Java堆空间和netty错误。我认为这不应该发生,因为我使用相同的spark配置并为两种方法调用相同的代码。
spark提交的代码为:
def main(person_profile_indexed_blocked_loc,
         broadcast_matches_output_loc,
         not_broadcast_matches_output_loc):
    """Read the blocked person-profile parquet and run the FindMatches job.

    Args:
        person_profile_indexed_blocked_loc: path of the input parquet dataset.
        broadcast_matches_output_loc: output path for broadcast matches.
        not_broadcast_matches_output_loc: output path for non-broadcast matches.
    """
    # NOTE(review): under spark-submit the driver JVM is already running before
    # this Python code executes, so the driver-side settings below
    # ('spark.driver.memory', 'spark.driver.maxResultSize') are silently
    # ignored when set here.  Pass them on the command line instead, e.g.:
    #   spark-submit --driver-memory 20g \
    #       --conf spark.driver.maxResultSize=51200m ...
    # This mismatch is the likely reason the job OOMs from spark-submit but
    # runs fine when the session is created fresh inside Jupyter.
    spark = (SparkSession
             .builder
             .config('spark.sql.autoBroadcastJoinThreshold', '1097152000')
             .config('spark.dynamicAllocation.enabled', 'False')
             .config('spark.executor.cores', '5')
             .config('spark.sql.shuffle.partitions', '2001')
             .config('spark.executor.instances', '45')
             .config('spark.executor.memory', '20g')
             .config('spark.driver.memory', '20g')            # ignored under spark-submit; use --driver-memory
             .config('spark.driver.maxResultSize', '51200m')  # ignored under spark-submit; use --conf
             .config('spark.kryoserializer.buffer.max', '512m')
             .getOrCreate())
    person_profile_indexed_blocked = spark.read.parquet(person_profile_indexed_blocked_loc)
    finding_matches = FindMatches(
        person_profile_indexed_blocked=person_profile_indexed_blocked,
        spark=spark,
        broadcast_matches_output=broadcast_matches_output_loc,
        not_broadcast_matches_output=not_broadcast_matches_output_loc)
    finding_matches.run()
if __name__ == '__main__':
    # TODO(review): replace the 'location' placeholders with real paths, or
    # read them from sys.argv so spark-submit can supply them as arguments.
    person_profile_indexed_blocked_loc = 'location'
    broadcast_matches_output_loc = 'location'
    not_broadcast_matches_output_loc = 'location'
    main(person_profile_indexed_blocked_loc,
         broadcast_matches_output_loc,
         not_broadcast_matches_output_loc)
当我使用以下代码在jupyter中运行它时,它工作正常:
# Jupyter version of the same job.  In a notebook the session is created by
# this cell, so the builder configs (including driver memory) take effect at
# getOrCreate() -- unlike spark-submit, where the driver JVM already exists.
spark = (SparkSession
         .builder
         .config('spark.sql.autoBroadcastJoinThreshold', '1097152000')
         .config('spark.dynamicAllocation.enabled', 'False')
         .config('spark.executor.cores', '5')
         .config('spark.sql.shuffle.partitions', '2001')
         .config('spark.executor.instances', '45')
         .config('spark.executor.memory', '20g')
         .config('spark.driver.memory', '20g')
         .config('spark.driver.maxResultSize', '51200m')
         .config('spark.kryoserializer.buffer.max', '512m')
         .getOrCreate())
person_profile_indexed_blocked_loc = 'location'
broadcast_matches_output_loc = 'location'
not_broadcast_matches_output_loc = 'location'
# BUG in the pasted snippet: person_profile_indexed_blocked was referenced
# below but never assigned here; load it the same way the spark-submit
# version does (presumably an earlier notebook cell did this -- confirm).
person_profile_indexed_blocked = spark.read.parquet(person_profile_indexed_blocked_loc)
finding_matches = FindMatches(
    person_profile_indexed_blocked=person_profile_indexed_blocked,
    spark=spark,
    broadcast_matches_output=broadcast_matches_output_loc,
    not_broadcast_matches_output=not_broadcast_matches_output_loc)
finding_matches.run()
以下是错误消息的示例。我不知道为什么它会以一种方式起作用,而不是另一种方式。
ERROR util.Utils: Uncaught exception in thread task-result-getter-0
java.lang.OutOfMemoryError: Java heap space
at org.apache.spark.scheduler.DirectTaskResult$$anonfun$readExternal$1.apply$mcV$sp(TaskResult.scala:57)
at org.apache.spark.scheduler.DirectTaskResult$$anonfun$readExternal$1.apply(TaskResult.scala:55)
at org.apache.spark.scheduler.DirectTaskResult$$anonfun$readExternal$1.apply(TaskResult.scala:55)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1360)
at org.apache.spark.scheduler.DirectTaskResult.readExternal(TaskResult.scala:55)
at java.io.ObjectInputStream.readExternalData(ObjectInputStream.java:2109)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2058)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1567)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:427)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:108)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3$$anonfun$run$1.apply$mcV$sp(TaskResultGetter.scala:91)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3$$anonfun$run$1.apply(TaskResultGetter.scala:63)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3$$anonfun$run$1.apply(TaskResultGetter.scala:63)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2005)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3.run(TaskResultGetter.scala:62)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Exception in thread "task-result-getter-0" java.lang.OutOfMemoryError: Java heap space
at org.apache.spark.scheduler.DirectTaskResult$$anonfun$readExternal$1.apply$mcV$sp(TaskResult.scala:57)
at org.apache.spark.scheduler.DirectTaskResult$$anonfun$readExternal$1.apply(TaskResult.scala:55)
at org.apache.spark.scheduler.DirectTaskResult$$anonfun$readExternal$1.apply(TaskResult.scala:55)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1360)
at org.apache.spark.scheduler.DirectTaskResult.readExternal(TaskResult.scala:55)
at java.io.ObjectInputStream.readExternalData(ObjectInputStream.java:2109)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2058)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1567)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:427)
at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:108)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3$$anonfun$run$1.apply$mcV$sp(TaskResultGetter.scala:91)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3$$anonfun$run$1.apply(TaskResultGetter.scala:63)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3$$anonfun$run$1.apply(TaskResultGetter.scala:63)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2005)
at org.apache.spark.scheduler.TaskResultGetter$$anon$3.run(TaskResultGetter.scala:62)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
19/08/08 17:13:47 WARN server.TransportChannelHandler: Exception in connection from nf5460u4-dn.travelsky.bdp.com/10.5.188.152:35650
java.lang.OutOfMemoryError: Java heap space
at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
at io.netty.buffer.CompositeByteBuf.nioBuffer(CompositeByteBuf.java:1466)
at io.netty.buffer.AbstractDerivedByteBuf.nioBuffer(AbstractDerivedByteBuf.java:113)
at io.netty.buffer.AbstractByteBuf.nioBuffer(AbstractByteBuf.java:1203)
at org.apache.spark.network.buffer.NettyManagedBuffer.nioByteBuffer(NettyManagedBuffer.java:45)
at org.apache.spark.network.BlockTransferService$$anon$1.onBlockFetchSuccess(BlockTransferService.scala:109)
at org.apache.spark.network.shuffle.RetryingBlockFetcher$RetryingBlockFetchListener.onBlockFetchSuccess(RetryingBlockFetcher.java:204)