I am trying to connect to a DB2 database from a Spark Streaming application and run a query against it, and I am hitting "org.apache.spark.SparkException: Task not serializable". Please advise. Below is the sample code I am working from.
dataLines.foreachRDD { rdd =>
  val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
  val dataRows = rdd.map(rs => rs.value).map(row =>
    row.split(",")(1) -> (row.split(",")(0), row.split(",")(1), row.split(",")(2),
      "cvflds_" + row.split(",")(3).toLowerCase, row.split(",")(4), row.split(",")(5), row.split(",")(6))
  )

  val db2Conn = getDB2Connection(spark, db2ConParams)

  dataRows.foreach { case (k, v) =>
    val table = v._4
    val dbQuery = s"(SELECT * FROM $table) tblResult"
    val df = getTableData(db2Conn, dbQuery)
    df.show(2)
  }
}
Below is the rest of the supporting code:
private def getDB2Connection(spark: SparkSession, db2ConParams: scala.collection.immutable.Map[String, String]): DataFrameReader = {
  spark.read.format("jdbc").options(db2ConParams)
}

private def getTableData(db2Con: DataFrameReader, tableName: String): DataFrame = {
  db2Con.option("dbtable", tableName).load()
}
object SparkSessionSingleton {
  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
Below is the error log:
2018-03-28 22:12:21,487 [JobScheduler] ERROR org.apache.spark.streaming.scheduler.JobScheduler - Error running job streaming job 1522289540000 ms.0
org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
    at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:916)
    at org.apache.spark.rdd.RDD$$anonfun$foreach$1.apply(RDD.scala:915)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.RDD.foreach(RDD.scala:915)
    at ncc.org.civil.receiver.DB2DataLoadToKudu$$anonfun$createSparkContext$1.apply(DB2DataLoadToKudu.scala:139)
    at ncc.org.civil.receiver.DB2DataLoadToKudu$$anonfun$createSparkContext$1.apply(DB2DataLoadToKudu.scala:128)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:627)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
    at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:415)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
    at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
    at scala.util.Try$.apply(Try.scala:192)
    at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:254)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:254)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
    at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:253)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.NotSerializableException: org.apache.spark.sql.DataFrameReader
Serialization stack:
    - object not serializable (class: org.apache.spark.sql.DataFrameReader, value: org.apache.spark.sql.DataFrameReader@15fdb01)
    - field (class: ncc.org.civil.receiver.DB2DataLoadToKudu$$anonfun$createSparkContext$1$$anonfun$apply$2, name: db2Conn$1, type: class org.apache.spark.sql.DataFrameReader)
    - object (class ncc.org.civil.receiver.DB2DataLoadToKudu$$anonfun$createSparkContext$1$$anonfun$apply$2, )
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
    ... 30 more
Answer 0 (score: 1)
Ideally, you should keep the closure in dataRows.foreach clear of any connection objects, because the closure is serialized and shipped to the executors to run there. The Spark Streaming programming guide's discussion of design patterns for using foreachRDD covers this concept in more depth.
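As a minimal, self-contained sketch of that rule (the names below are illustrative, not taken from your code):

import org.apache.spark.sql.SparkSession

object ClosureSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("closure-sketch").master("local[*]").getOrCreate()
    val rdd = spark.sparkContext.parallelize(Seq("a", "b", "c"))

    // DataFrameReader (what spark.read returns) is not Serializable.
    val reader = spark.read.format("jdbc")

    // This would fail with "Task not serializable": the closure captures `reader`,
    // so Spark has to serialize the whole closure to ship it to the executors.
    // rdd.foreach(x => println(reader.toString + x))

    // This is fine: nothing non-serializable is captured by the closure.
    rdd.foreach(x => println(x.toUpperCase))

    spark.stop()
  }
}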
In your case, the line below is what drags the non-serializable object into the closure:
val df = getTableData(db2Conn, dbQuery)
So, instead of using Spark to load the DB2 table, which in your case (after combining the two helper methods) boils down to:
spark.read.format("jdbc").options(db2ConParams).option("dbtable",tableName).load()
use plain JDBC inside the closure to achieve this. You can reuse db2ConParams in the JDBC code (I assume it is simple enough to be serializable). That guide also suggests using rdd.foreachPartition together with a ConnectionPool for further optimization.
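As a rough sketch of what that could look like, assuming db2ConParams holds the usual Spark JDBC keys ("url", "user", "password", optionally "driver"); adjust the keys, query handling, and error handling to your actual setup. This would replace the dataRows.foreach block:

import java.sql.{Connection, DriverManager}

dataRows.foreachPartition { partition =>
  // Open one JDBC connection per partition, on the executor, instead of shipping one from the driver.
  db2ConParams.get("driver").foreach(Class.forName)            // load the DB2 driver class if provided
  val conn: Connection = DriverManager.getConnection(
    db2ConParams("url"), db2ConParams("user"), db2ConParams("password"))
  try {
    partition.foreach { case (k, v) =>
      val table = v._4
      val stmt = conn.createStatement()
      val rs = stmt.executeQuery(s"SELECT * FROM $table")      // plain JDBC query, no DataFrameReader involved
      try {
        // Do whatever you need with the rows here; printing the first column of
        // the first two rows stands in for your df.show(2).
        var i = 0
        while (rs.next() && i < 2) {
          println(rs.getString(1))
          i += 1
        }
      } finally {
        rs.close()
        stmt.close()
      }
    }
  } finally {
    conn.close()                                               // always release the connection
  }
}

Only db2ConParams (a plain Map[String, String]) is captured by the closure, so there is nothing non-serializable to ship; the connection itself is created and closed on the executor.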
Apart from df.show(2), you haven't mentioned what you are actually doing with the table data. If the rows are large, it would help to share more about your use case; you may need to consider a different design.