Spark Job 抛出“任务不可序列化”(Task not serializable)异常。原因:java.io.UTFDataFormatException:编码的字符串太长:89216 字节

时间:2019-12-24 13:57:26

标签: apache-spark

我使用以下配置设置了 Spark 作业,它在我的本地计算机上运行良好,但在 EMR 集群上抛出了以下异常。

scalaVersion in Global := "2.11.8"

crossScalaVersions in Global := Seq("2.11.8")

依赖关系如下。

val ScallopVersion = "3.1.2"
  val SparkVersion = "2.4.0"
  val OpenCsvVersion = "3.7"
  val MockitoVersion = "1.10.19"
  val ConfigVersion = "1.2.1"
  val ScalazVersion = "7.3.0-M31"    

  val ConfigDependencies = Seq("com.typesafe" % "config" % ConfigVersion)

  val ParserDependencies = Seq("com.opencsv" % "opencsv" % OpenCsvVersion)

  val JodaTimeDependencies = Seq(
    "joda-time" % "joda-time" % JodaTimeVersion % Provided,
    "org.joda" % "joda-convert" % JodaConvertVersion % Provided)

  val KryoDependencies = Seq("com.esotericsoftware.kryo" % "kryo" % KryoVersion % Provided)

  val PredictiveCommonDependencies = Seq("com.here.traffic.predictive" % "predictive-common_2.10" % PredictiveCommonVersion)

  val SparkDependencies = Seq(
    "org.apache.spark" %% "spark-core" % SparkVersion % Provided excludeAll ExclusionRule("org.apache.hadoop"),
    "org.apache.spark" %% "spark-core" % SparkVersion % Test excludeAll ExclusionRule("org.apache.hadoop"),
    "org.apache.hadoop" % "hadoop-client" % HadoopVersion % Provided excludeAll(ExclusionRule(organization = "javax.servlet"), ExclusionRule(organization = "org.mortbay.jetty")),
    "org.apache.hadoop" % "hadoop-client" % HadoopVersion % Test excludeAll(ExclusionRule(organization = "javax.servlet"), ExclusionRule(organization = "org.mortbay.jetty")),
    "com.google.guava" % "guava" % GuavaVersion
  )

  val FunctionalProgrammingDependencies = Seq("org.scalaz" % "scalaz-core_2.11" % ScalazVersion)

  val ScallopDependencies = Seq("org.rogach" %% "scallop" % ScallopVersion)

ERROR ApplicationMaster: User class threw exception: org.apache.spark.SparkException: Task not serializable
org.apache.spark.SparkException: Task not serializable
at com.here.traffic.learning.steps.MapMatcherStep$.apply(MapMatcherStep.scala:44)
    at com.here.traffic.learning.steps.TASteps$.runJob(TASteps.scala:22)
    at com.here.traffic.learning.steps.TASteps$.main(TASteps.scala:13)
    at com.here.traffic.learning.steps.TASteps.main(TASteps.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:678)
Caused by: java.io.UTFDataFormatException: encoded string too long: 89216 bytes
    at java.io.DataOutputStream.writeUTF(DataOutputStream.java:364)
    at java.io.DataOutputStream.writeUTF(DataOutputStream.java:323)
    at com.typesafe.config.impl.SerializedConfigValue.writeValueData(SerializedConfigValue.java:301)

// 在解析字符串时,异常由下面这一行抛出。

com.here.traffic.learning.steps.MapMatcherStep 第 44 行:

    val pdrProbes: RDD[MapMatchInputPoint] = rawProbes.flatMap(ss=> pdrParser.parse(ss))

0 个答案:

没有答案