Decoupling a non-serializable object to avoid serialization errors in Spark

Date: 2016-03-10 21:44:28

Tags: scala serialization apache-spark

The following class contains the main function, which tries to read from Elasticsearch and print the returned documents:

object TopicApp extends Serializable {

  def run() {

    val start = System.currentTimeMillis()

    val sparkConf = new Configuration()
    sparkConf.set("spark.executor.memory","1g")
    sparkConf.set("spark.kryoserializer.buffer","256")

    val es = new EsContext(sparkConf)
    val esConf = new Configuration()
    esConf.set("es.nodes","localhost")
    esConf.set("es.port","9200")
    esConf.set("es.resource", "temp_index/some_doc")
    esConf.set("es.query", "?q=*:*")
    esConf.set("es.fields", "_score,_id")

    val documents = es.documents(esConf)
    documents.foreach(println)

    val end = System.currentTimeMillis()
    println("Total time: " + (end-start) + " ms")

    es.shutdown()

  }

  def main(args: Array[String]) {
    run()
  }

}

The following class converts the returned documents to JSON using org.json4s:

class EsContext(sparkConf:HadoopConfig) extends SparkBase {
  private val sc = createSCLocal("ElasticContext", sparkConf)

  def documentsAsJson(esConf:HadoopConfig):RDD[String] = {
    implicit val formats = DefaultFormats
    val source = sc.newAPIHadoopRDD(
      esConf,
      classOf[EsInputFormat[Text, MapWritable]],
      classOf[Text],
      classOf[MapWritable]
    )
    val docs = source.map(
      hit => {
        val doc = Map("ident" -> hit._1.toString) ++ mwToMap(hit._2)
        write(doc)
      }
    )
    docs
  }

  def shutdown() = sc.stop()

  // mwToMap() converts MapWritable to Map

}

The following trait creates a local SparkContext for the application:

trait SparkBase extends Serializable {
  protected def createSCLocal(name:String, config:HadoopConfig):SparkContext = {
    val iterator = config.iterator()
    for (prop <- iterator) {
      val k = prop.getKey
      val v = prop.getValue
      if (k.startsWith("spark."))
        System.setProperty(k, v)
    }
    val runtime = Runtime.getRuntime
    runtime.gc()

    val conf = new SparkConf()
    conf.setMaster("local[2]")

    conf.setAppName(name)
    conf.set("spark.serializer", classOf[KryoSerializer].getName)

    conf.set("spark.ui.port", "0")

    new SparkContext(conf)
  }
}

When I run TopicApp, I get the following error:

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
    at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:324)
    at org.apache.spark.rdd.RDD$$anonfun$map$1.apply(RDD.scala:323)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.RDD.map(RDD.scala:323)
    at TopicApp.EsContext.documents(EsContext.scala:51)
    at TopicApp.TopicApp$.run(TopicApp.scala:28)
    at TopicApp.TopicApp$.main(TopicApp.scala:39)
    at TopicApp.TopicApp.main(TopicApp.scala)
Caused by: java.io.NotSerializableException: org.apache.spark.SparkContext
Serialization stack:
    - object not serializable (class: org.apache.spark.SparkContext, value: org.apache.spark.SparkContext@14f70e7d)
    - field (class: TopicApp.EsContext, name: sc, type: class org.apache.spark.SparkContext)
    - object (class TopicApp.EsContext, TopicApp.EsContext@2cf77cdc)
    - field (class: TopicApp.EsContext$$anonfun$documents$1, name: $outer, type: class TopicApp.EsContext)
    - object (class TopicApp.EsContext$$anonfun$documents$1, <function1>)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
    ... 13 more

Going through other posts that cover similar issues, most of them suggest either making the class Serializable or separating the non-serializable object from the class.

From the error I get, I infer that the SparkContext field sc cannot be serialized, because SparkContext is not a serializable class.

How should I decouple the SparkContext so that the application runs correctly?

1 Answer:

Answer 0 (score: 1)

I couldn't run your program, but the general rule is not to create anonymous functions that refer to members of a non-serializable class if they have to be executed on an RDD's data. In your case:

  • EsContext has a val of type SparkContext, which is (intentionally) not serializable
  • In the anonymous function passed to RDD.map inside EsContext.documentsAsJson, you call another method of this EsContext instance (mwToMap), which forces Spark to serialize that instance, together with the SparkContext it holds (see the minimal sketch right after this list)
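
To make the capture concrete, here is a minimal sketch that is not part of your code (Holder, tag and HolderDemo are made-up names): calling an instance method inside the closure compiles to this.tag(...), so Spark has to serialize the whole enclosing instance, including the SparkContext it holds.

import org.apache.spark.{SparkConf, SparkContext}

// Not Serializable, and it holds a SparkContext -- the same shape as EsContext.
class Holder(sc: SparkContext) {
  private def tag(s: String): String = s + "!"

  def broken(): Array[String] = {
    val rdd = sc.parallelize(Seq("a", "b"))
    // `tag(x)` is really `this.tag(x)`: the closure captures `this`, so Spark
    // tries to serialize Holder and its SparkContext and fails with
    // "Task not serializable" already at the call to map.
    rdd.map(x => tag(x)).collect()
  }
}

object HolderDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("capture-demo"))
    try new Holder(sc).broken()   // throws org.apache.spark.SparkException
    finally sc.stop()
  }
}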

One possible solution is to move mwToMap out of the EsContext class, for example into EsContext's companion object (objects don't need to be serialized, since they are effectively static). If there are other methods of the same nature (write?), they would have to be moved as well. That would look something like this:

import EsContext._

class EsContext(sparkConf:HadoopConfig) extends SparkBase {
   private val sc = createSCLocal("ElasticContext", sparkConf)

   def documentsAsJson(esConf: HadoopConfig): RDD[String] = { /* unchanged */ }
   def documents(esConf: HadoopConfig): RDD[EsDocument] = { /* unchanged */ }
   def shutdown() = sc.stop()
}

object EsContext {
   private def mwToMap(mw: MapWritable): Map[String, String] = { ... }
}
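
For completeness, here is a fuller sketch of the same idea; I'm assuming here that HadoopConfig aliases org.apache.hadoop.conf.Configuration, that write is json4s's Serialization.write, and that mwToMap keeps its original body (elided below). With import EsContext._ the map closure resolves mwToMap against the companion object, which is reached statically, so nothing from the EsContext instance (and hence no SparkContext) gets captured:

import org.apache.hadoop.conf.{Configuration => HadoopConfig}   // assumed alias
import org.apache.hadoop.io.{MapWritable, Text}
import org.apache.spark.rdd.RDD
import org.elasticsearch.hadoop.mr.EsInputFormat
import org.json4s.DefaultFormats
import org.json4s.native.Serialization.write                    // assumed origin of write

import EsContext._

class EsContext(sparkConf: HadoopConfig) extends SparkBase {
  private val sc = createSCLocal("ElasticContext", sparkConf)

  def documentsAsJson(esConf: HadoopConfig): RDD[String] = {
    implicit val formats = DefaultFormats
    val source = sc.newAPIHadoopRDD(
      esConf,
      classOf[EsInputFormat[Text, MapWritable]],
      classOf[Text],
      classOf[MapWritable]
    )
    // mwToMap now resolves (via import EsContext._) to the companion object,
    // so this closure no longer captures `this` -- and with it the SparkContext.
    source.map(hit => write(Map("ident" -> hit._1.toString) ++ mwToMap(hit._2)))
  }

  def shutdown() = sc.stop()
}

object EsContext {
  // A pure function of its argument; since it lives on the (static) companion
  // object, the closure can call it without serializing any EsContext instance.
  private def mwToMap(mw: MapWritable): Map[String, String] = ??? // original body, unchanged
}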

If moving these methods out is not possible (i.e. if they need some of EsContext's members), then consider separating the class that does the actual mapping from this context (which appears to be some kind of wrapper around SparkContext; if that's what it is, that's all it should contain).
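
One hedged reading of that last suggestion (EsJson, hits and asJson are hypothetical names, and the same import assumptions as above apply): keep EsContext as nothing more than a SparkContext wrapper that hands back the raw Hadoop RDD, and put all mapping logic in a standalone object with no reference to the context at all:

import org.apache.hadoop.conf.{Configuration => HadoopConfig}   // assumed alias
import org.apache.hadoop.io.{MapWritable, Text}
import org.apache.spark.rdd.RDD
import org.elasticsearch.hadoop.mr.EsInputFormat
import org.json4s.DefaultFormats
import org.json4s.native.Serialization.write                    // assumed origin of write

// Thin wrapper: owns the SparkContext, exposes the raw hits, nothing else.
class EsContext(sparkConf: HadoopConfig) extends SparkBase {
  private val sc = createSCLocal("ElasticContext", sparkConf)

  def hits(esConf: HadoopConfig): RDD[(Text, MapWritable)] =
    sc.newAPIHadoopRDD(esConf, classOf[EsInputFormat[Text, MapWritable]],
      classOf[Text], classOf[MapWritable])

  def shutdown() = sc.stop()
}

// All mapping lives here; nothing in scope refers to a SparkContext.
object EsJson {
  def asJson(hits: RDD[(Text, MapWritable)]): RDD[String] =
    hits.map { case (id, mw) =>
      implicit val formats = DefaultFormats
      write(Map("ident" -> id.toString) ++ mwToMap(mw))
    }

  private def mwToMap(mw: MapWritable): Map[String, String] = ??? // as in the original
}

TopicApp.run would then call something like EsJson.asJson(es.hits(esConf)).foreach(println) instead of es.documents(esConf).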