I am running a Spark job that is supposed to write a DataFrame to a MongoDB collection. It ran for about an hour and then stopped abruptly with the following exception:
Exception in thread "main" org.apache.spark.SparkException:
Job aborted due to stage failure: Task 20273 in stage 29.0 failed 1 times, most recent failure:
Lost task 20273.0 in stage 29.0 (TID 21088, localhost, executor driver):
com.mongodb.MongoSocketReadException: Prematurely reached end of stream
at com.mongodb.connection.SocketStream.read(SocketStream.java:88)
at com.mongodb.connection.InternalStreamConnection.receiveResponseBuffers(InternalStreamConnection.java:494)
at com.mongodb.connection.InternalStreamConnection.receiveMessage(InternalStreamConnection.java:224)
at com.mongodb.connection.UsageTrackingInternalConnection.receiveMessage(UsageTrackingInternalConnection.java:96)
at com.mongodb.connection.DefaultConnectionPool$PooledConnection.receiveMessage(DefaultConnectionPool.java:440)
at com.mongodb.connection.WriteCommandProtocol.receiveMessage(WriteCommandProtocol.java:262)
at com.mongodb.connection.WriteCommandProtocol.execute(WriteCommandProtocol.java:104)
at com.mongodb.connection.InsertCommandProtocol.execute(InsertCommandProtocol.java:67)
at com.mongodb.connection.InsertCommandProtocol.execute(InsertCommandProtocol.java:37)
at com.mongodb.connection.DefaultServer$DefaultServerProtocolExecutor.execute(DefaultServer.java:168)
at com.mongodb.connection.DefaultServerConnection.executeProtocol(DefaultServerConnection.java:289)
at com.mongodb.connection.DefaultServerConnection.insertCommand(DefaultServerConnection.java:118)
at com.mongodb.operation.MixedBulkWriteOperation$Run$2.executeWriteCommandProtocol(MixedBulkWriteOperation.java:465)
at com.mongodb.operation.MixedBulkWriteOperation$Run$RunExecutor.execute(MixedBulkWriteOperation.java:656)
at com.mongodb.operation.MixedBulkWriteOperation$Run.execute(MixedBulkWriteOperation.java:411)
at com.mongodb.operation.MixedBulkWriteOperation$1.call(MixedBulkWriteOperation.java:177)
at com.mongodb.operation.MixedBulkWriteOperation$1.call(MixedBulkWriteOperation.java:168)
at com.mongodb.operation.OperationHelper.withConnectionSource(OperationHelper.java:422)
at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:413)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:168)
at com.mongodb.operation.MixedBulkWriteOperation.execute(MixedBulkWriteOperation.java:74)
at com.mongodb.Mongo.execute(Mongo.java:845)
at com.mongodb.Mongo$2.execute(Mongo.java:828)
at com.mongodb.MongoCollectionImpl.insertMany(MongoCollectionImpl.java:338)
at com.mongodb.MongoCollectionImpl.insertMany(MongoCollectionImpl.java:322)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1$$anonfun$apply$2.apply(MongoSpark.scala:119)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1$$anonfun$apply$2.apply(MongoSpark.scala:119)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1.apply(MongoSpark.scala:119)
at com.mongodb.spark.MongoSpark$$anonfun$save$1$$anonfun$apply$1.apply(MongoSpark.scala:118)
at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:186)
at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:184)
at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:154)
at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:171)
at com.mongodb.spark.MongoConnector.withCollectionDo(MongoConnector.scala:184)
at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:118)
at com.mongodb.spark.MongoSpark$$anonfun$save$1.apply(MongoSpark.scala:117)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Only part of the data gets written to MongoDB. I am using Scala with the MongoDB Spark Connector. How can I fix this?
EDIT
Here is the setup code for the Spark session:
val spark: SparkSession = SparkSession.builder()
.appName("Spark Movie Similarities")
.master("local[*]")
.config("spark.mongodb.input.uri", "mongodb://127.0.0.1/movie_db.movie_ratings")
.config("spark.mongodb.output.uri", "mongodb://127.0.0.1/movie_db.similarities")
.getOrCreate()
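Could the connection be timing out or going idle between write batches? One variant I was considering is passing socket options through the output URI. The option names (socketTimeoutMS, maxIdleTimeMS) are standard MongoDB connection-string options, but the values below are just guesses, not a known fix:

import org.apache.spark.sql.SparkSession

val spark: SparkSession = SparkSession.builder()
  .appName("Spark Movie Similarities")
  .master("local[*]")
  .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/movie_db.movie_ratings")
  // socketTimeoutMS and maxIdleTimeMS are standard MongoDB connection-string
  // options; the values here are guesses, not a confirmed fix.
  .config("spark.mongodb.output.uri",
    "mongodb://127.0.0.1/movie_db.similarities?socketTimeoutMS=120000&maxIdleTimeMS=600000")
  .getOrCreate()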
And here is how I write the results back to MongoDB:
MongoSpark.save(movieSimilarities)
Here movieSimilarities is a Spark DataFrame. Nothing special is going on. The task only fails after a certain number of records have already been written to MongoDB successfully.
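Would reducing the bulk write size help? A minimal sketch of what I had in mind, using the connector's WriteConfig (maxBatchSize is a documented WriteConfig option, default 512; the override below is an arbitrary guess):

import com.mongodb.spark.MongoSpark
import com.mongodb.spark.config.WriteConfig

// Start from the session's spark.mongodb.output.* settings and override
// maxBatchSize so each insertMany round trip carries fewer documents.
// 128 is an arbitrary guess, not a known-good value.
val writeConfig = WriteConfig(
  Map("maxBatchSize" -> "128"),
  Some(WriteConfig(spark.sparkContext))
)
MongoSpark.save(movieSimilarities, writeConfig)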