如何在带有SparkContext的Play框架控制器中使用flatmap?

时间:2018-02-07 12:32:49

标签: scala apache-spark serialization playframework

我有一个使用Play 2.6,Scala 2.11和Spark 2.2.0的网络应用程序。

当我对某个变量执行org.apache.spark.SparkException: Task not serializable转换时,我收到异常:flatmap。我知道我必须在某些类中实现Serializable,但我不知道这样做的最佳实践。

异常发生在var namesRdd = names.flatMap(parseNames)行。如果我使用MyController with Serializable,我会遇到另一个错误:class invalid for deserialization。所以我认为这不是解决方案。

有谁知道如何序列化Controller以使用Spark Context和flatmap?

class SparkMarvelController @Inject()(cc: ControllerComponents) extends AbstractController(cc) with I18nSupport {

  def mostPopularSuperHero() = Action { implicit request: Request[AnyContent] =>
    val sparkContext = SparkCommons.sparkSession.sparkContext // got sparkContext

    var names = sparkContext
      .textFile("resource/marvel/Marvel-names.txt") // build up a hero ID - name RDD

    var namesRdd = names.flatMap(parseNames)

    val mostPopularHero = sparkContext
      .textFile("resource/marvel/Marvel-graph.txt") // build up superhero co-apperance data
      .map(countCoOccurrences) // convert to (hero ID, number of connections) RDD
      .reduceByKey((x, y) => x + y) // combine entries that span more than one line
      .map(x => (x._2, x._1)) // flip it to (number of connections, hero ID)
      .max // find the max connections

    // Look up the name (lookup returns an array of results, so we need to access the first result with (0))
    val mostPopularHeroName = namesRdd.lookup(mostPopularHero._2)(0)

    Ok(s"The most popular superhero is [$mostPopularHeroName] with [${mostPopularHero._1}] co-appearances.")
  }

  // Function to extract the hero ID and number of connections from each line
  def countCoOccurrences(line: String) = {
    // regex expression to split using any type of space occurrency in the line
    val elements = line.split("\\s+")
    (elements(0).toInt, elements.length - 1)
  }

  // function to extract hero ID -> hero name tuples (or None in case of Failure)
  def parseNames(line: String): Option[(Int, String)] = {
    var fields = line.split('\"')
    if (fields.length > 1) return Some(fields(0).trim.toInt, fields(1))
    else return None
  }
}

错误:

play.api.http.HttpErrorHandlerExceptions$$anon$1: Execution exception[[ClassNotFoundException: controllers.SparkMarvelController$$anonfun$mostPopularSuperHero$1$$anonfun$2]]
    at play.api.http.HttpErrorHandlerExceptions$.throwableToUsefulException(HttpErrorHandler.scala:255)
    at play.api.http.DefaultHttpErrorHandler.onServerError(HttpErrorHandler.scala:180)
    at play.core.server.AkkaHttpServer$$anonfun$3.applyOrElse(AkkaHttpServer.scala:311)
    at play.core.server.AkkaHttpServer$$anonfun$3.applyOrElse(AkkaHttpServer.scala:309)
    at scala.concurrent.Future$$anonfun$recoverWith$1.apply(Future.scala:346)
    at scala.concurrent.Future$$anonfun$recoverWith$1.apply(Future.scala:345)
    at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
    at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:55)
    at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply$mcV$sp(BatchingExecutor.scala:91)
    at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply(BatchingExecutor.scala:91)
Caused by: java.lang.ClassNotFoundException: controllers.SparkMarvelController$$anonfun$mostPopularSuperHero$1$$anonfun$2
    at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
    at java.lang.Class.forName0(Native Method)
    at java.lang.Class.forName(Class.java:348)
    at org.apache.spark.util.InnerClosureFinder$$anon$4.visitMethodInsn(ClosureCleaner.scala:429)
    at org.apache.xbean.asm5.ClassReader.a(Unknown Source)
    at org.apache.xbean.asm5.ClassReader.b(Unknown Source)
    at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
    at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)

0 个答案:

没有答案