以下代码与我要执行的操作很接近。我有一个数据帧的RDD,我想将这些数据帧附加到最后构成1个数据帧。
import org.apache.spark.sql.SparkSession
//scalastyle:off
object test{
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("Ingester")
.master("local[2]")
.getOrCreate()
val rdd = spark.sparkContext.parallelize(List(spark.emptyDataFrame, spark.emptyDataFrame))
rdd
.fold(spark.emptyDataFrame)(_.union(_))
}
}
出于某种原因,尝试执行此操作时会得到NPE。
Caused by: java.lang.NullPointerException
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:73)
at org.apache.spark.sql.Dataset.withSetOperator(Dataset.scala:3313)
at org.apache.spark.sql.Dataset.union(Dataset.scala:1834)
at test$$anonfun$main$1.apply(test.scala:14)
at test$$anonfun$main$1.apply(test.scala:14)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:155)
at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:155)
at scala.collection.Iterator$class.foreach(Iterator.scala:742)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:155)
at org.apache.spark.InterruptibleIterator.foldLeft(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.fold(TraversableOnce.scala:210)
at org.apache.spark.InterruptibleIterator.fold(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$fold$1$$anonfun$19.apply(RDD.scala:1096)
at org.apache.spark.rdd.RDD$$anonfun$fold$1$$anonfun$19.apply(RDD.scala:1096)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2130)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2130)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
有人知道为什么会这样吗?