NullPointerException: creating a Dataset/DataFrame inside foreachPartition / foreach

Asked: 2017-10-26 21:38:44

Tags: scala apache-spark nullpointerexception apache-spark-sql

1) With either of the following two variants (the first converts each group to a Dataset with toSeq.toDS, the second re-creates a DataFrame with sparkContext.parallelize), I get a NullPointerException in both local and cluster mode:
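(ABC is not defined in this question; judging from the a/b/c columns in the show() output further down, it is presumably a simple case class along these lines:)

case class ABC(a: String, b: String, c: Int) // assumed definition, inferred from the show() output below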

import sparkSession.implicits._
val testDS = sparkSession.createDataFrame(
  Seq(
    ABC("1","2", 1),
    ABC("3","9", 3),
    ABC("8","2", 2),
    ABC("1","2", 3),
    ABC("3","9", 1),
    ABC("2","7", 1),
    ABC("1","3", 2))
).as[ABC]

val t = testDS
  .rdd
  .groupBy(_.c)
  .foreachPartition(
    p => p.foreach(
      a => {
        val id = a._1
        println("inside foreach, id: " + id)
        val itABC = a._2

        val itSeq = itABC.toSeq
        println(itSeq.size)

        val itDS = itSeq.toDS // Get "Caused by: java.lang.NullPointerException" here
        itDS.show()

        funcA(itDS, id)
      }
    )
  )
println(t.toString)

import sparkSession.implicits._
val testDS = sparkSession.createDataFrame(
  Seq(
    ABC("1","2", 1),
    ABC("3","9", 3),
    ABC("8","2", 2),
    ABC("1","2", 3),
    ABC("3","9", 1),
    ABC("2","7", 1),
    ABC("1","3", 2))
).as[ABC]

testDS
  .rdd
  .groupBy(_.c)
  .foreachPartition(
    p => p.foreach(
      a => {
        val id = a._1
        println("inside foreach, id: " + id)
        val itABC = a._2

        import sparkSession.implicits._
        val itDS = sparkSession.createDataFrame( 
          sparkSession.sparkContext.parallelize(itABC.toList, numSlices=200)) // get "NullPointerException" here
        itDS.show()

        funcA(itDS, id)
      }
    )
  )

Here is the output log for 1):

    17/10/26 15:07:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 4) / 4]
    17/10/26 15:07:29 WARN TaskSetManager: Lost task 0.0 in stage 2.0 (TID 8, 10.142.17.137, executor 0): java.lang.NullPointerException
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1$$anonfun$apply$1.apply(SL.scala:176)
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1$$anonfun$apply$1.apply(SL.scala:167)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1.apply(SL.scala:166)
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1.apply(SL.scala:166)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)

17/10/26 15:07:29 ERROR TaskSetManager: Task 0 in stage 2.0 failed 4 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 4 times, most recent failure: Lost task 0.3 in stage 2.0 (TID 12, 10.142.17.137, executor 0): java.lang.NullPointerException
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1$$anonfun$apply$1.apply(SL.scala:176)
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1$$anonfun$apply$1.apply(SL.scala:167)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1.apply(SL.scala:166)
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1.apply(SL.scala:166)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:926)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:924)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:924)
    at com.a.data_pipeline.SL.generateScaleGraphs(SL.scala:165)
    at com.a.data_pipeline.GA$$anonfun$generateGraphsDataScale$1.apply(GA.scala:23)
    at com.a.data_pipeline.GA$$anonfun$generateGraphsDataScale$1.apply(GA.scala:21)
    at scala.collection.immutable.List.foreach(List.scala:381)
    at com.a.data_pipeline.GA$.generateGraphsDataScale(GA.scala:21)
    at com.a.data_pipeline.GA$.main(GA.scala:52)
    at com.a.data_pipeline.GA.main(GA.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:755)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.NullPointerException
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1$$anonfun$apply$1.apply(SL.scala:176)
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1$$anonfun$apply$1.apply(SL.scala:167)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1.apply(SL.scala:166)
    at com.a.data_pipeline.SL$$anonfun$generateScaleGraphs$1.apply(SL.scala:166)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)

2) However, with the following code, it runs fine in local mode, but in cluster mode I get a NullPointerException caused by: org.apache.spark.SparkException: A master URL must be set in your configuration

import sparkSession.implicits._
val testDS = sparkSession.createDataFrame(
  Seq(
    ABC("1","2", 1),
    ABC("3","9", 3),
    ABC("8","2", 2),
    ABC("1","2", 3),
    ABC("3","9", 1),
    ABC("2","7", 1),
    ABC("1","3", 2))
).as[ABC]

val test = testDS
  .rdd
  .groupBy(_.c)
  .foreachPartition(
    p => p.foreach(
      a => {
        val id = a._1
        println("inside foreach, id: " + id)
        val itABC = a._2
        val ss = SparkSessionUtil.getInstance(clusterMode)
        import ss.implicits._
        val itDS = ss.createDataFrame(
          ss.sparkContext.parallelize(itABC.toList, numSlices=200)).as[ABC]
        itDS.show()
        funcA(itDS, id)  // in funcA, I'd like to use this itDS(Dataset) to do some calculation, like itDS.groupby().agg().filter()
      }
    )
  )

Here is the console output for 2):

17/10/26 14:19:12 WARN SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
inside foreach, id: 1
17/10/26 14:19:13 WARN SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  1|
|  3|  9|  1|
|  2|  7|  1|
+---+---+---+

inside foreach, id: 2
17/10/26 14:19:14 WARN SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
17/10/26 14:19:14 WARN SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
+---+---+---+
|  a|  b|  c|
+---+---+---+
|  8|  2|  2|
|  1|  3|  2|
+---+---+---+

inside foreach, id: 3
+---+---+---+
|  a|  b|  c|
+---+---+---+
|  3|  9|  3|
|  1|  2|  3|
+---+---+---+

Inside funcA(itDS, id) I'd like to use the per-id Dataset (itDS) to compute things like itDS.groupBy().agg().filter(). How can I solve this? Thanks in advance!
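For reference, funcA itself is not shown here; a hypothetical shape, purely to illustrate the kind of per-id computation I have in mind (the column names and aggregation are made up), would be:

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._

// Hypothetical body for funcA, only to show the intended per-id work
def funcA(itDS: Dataset[ABC], id: Int): Unit = {
  val result = itDS
    .groupBy("a")                 // group within this id's rows
    .agg(sum("c").as("total"))    // example aggregation
    .filter(col("total") > 1)     // example filter
  println(s"result for id $id:")
  result.show()
}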

1 Answer:

Answer (score: 2):

I ran into the same problem recently, and since there was no answer I'm adding one here. faustineinsun's comment is the answer:

    Thank you, @AlexandreDupriez! The problem has been solved by restructuring the code from sparkSession.sql() to Seq[ABC], so that sparkSession is not referenced inside the map/foreach function closures; since sparkSession is not serializable, it is only available on the driver, not on the worker nodes.

Conclusion: inside foreach / foreachPartition / map / mapPartitions you cannot create a new DataFrame with the Spark session (.read or .sql); doing so throws a NullPointerException.
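A minimal sketch of that restructuring, assuming funcA only needs a per-id Dataset and each group is small enough to bring back to the driver: collect the grouped rows to the driver first, then build the per-group Datasets there, so that sparkSession is never touched inside an executor-side closure.

import sparkSession.implicits._

// Collect the (id, rows) groups back to the driver; only safe when each group is small
val groups: Array[(Int, Seq[ABC])] =
  testDS.rdd
    .groupBy(_.c)
    .mapValues(_.toSeq)
    .collect()

// Driver-side loop: sparkSession is available here, so building a Dataset is fine
groups.foreach { case (id, rows) =>
  val itDS = rows.toDS()   // uses the driver's sparkSession via implicits
  itDS.show()
  funcA(itDS, id)          // e.g. itDS.groupBy(...).agg(...).filter(...)
}

Alternatively, when the per-id logic can be expressed with standard aggregations, it is usually simpler to skip the split entirely and run something like testDS.groupBy("c", "a").agg(...) on the full Dataset.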

See also:

How to use SQLContext and SparkContext inside foreachPartition