Exception in thread "main" org.apache.spark.SparkException: Task not serializable for dateFormat with Scala 2.10

Date: 2017-03-13 19:20:21

Tags: json scala apache-spark

This works when I use Scala version 2.11, but it stops working when I downgrade to 2.10, even though I use the json4s build for Scala 2.10.

I tried adding the native formats, but without success. Can someone provide some information on how to resolve this?

package com.golfbreaks.spark.streaming

import java.text.SimpleDateFormat

import com.golfbreaks.quote.Quote
import com.google.gson.{Gson, GsonBuilder}
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.json4s._
import org.json4s.native.JsonParser


object Test {

  def main(args: Array[String]): Unit = {

    implicit val formats = DefaultFormats
      //DefaultFormats

    val zkQuorum = "quickstart.cloudera"
    val topics = "sf_quotes"
    val group = "group1"
    val numThreads = 2
    val tableName: TableName = TableName.valueOf("sf_quotes")

    //val Array(zkQuorum, group, topics, numThreads) = args
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("GolfBreaksStreamingQuotes")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    val gson: Gson = new GsonBuilder().serializeNulls().create()

    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap, StorageLevel.MEMORY_ONLY)
      .filter(_._2 != null)
      .map {
        //case (_, json) => Quote.parseQuote(json)
        //case (_, json) => json
        //read[Quote](json)
        case (_, json) => JsonParser.parse(json).extract[Quote]
      }
    //.reduceByKeyAndWindow(_ + _, _ - _, Minutes(1), Seconds(2), 2)
    //lines.foreachRDD( rdd => rdd.foreach(println))

    //println(json);

    /*val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "instance-26765.bigstep.io,instance-26766.bigstep.io,instance-26767.bigstep.io")
    conf.set("hbase.master", "instance-26765.bigstep.io:60000")
    conf.setInt("timeout", 120000)*/

    //lines.print()

    lines.transform(rdd => {
      val hbaseTableName = "sf_quotes"
      //Creates the HBase confs
      val hconf = HBaseConfiguration.create()
      hconf.set("hbase.zookeeper.quorum", "quickstart.cloudera")
      //hconf.set("hbase.zookeeper.quorum", "instance-26765.bigstep.io,instance-26766.bigstep.io,instance-26767.bigstep.io,instance-26768.bigstep.io")
      hconf.set("hbase.zookeeper.property.clientPort", "2181")
      hconf.set("hbase.defaults.for.version.skip", "true")
      val job = Job.getInstance(hconf)
      val jobConf = job.getConfiguration
      jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableName.getNameAsString)
      job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
      //hconf.set(TableOutputFormat.OUTPUT_TABLE, hbaseTableName)
      //hconf.setClass("mapreduce.job.outputformat.class", classOf[TableOutputFormat[String]], classOf[OutputFormat[String, Mutation]])

      /*rdd.map {
        jsonStr => gson.fromJson(jsonStr,classOf[Quote])
      }*/
      rdd.map(quote => (new ImmutableBytesWritable, {

        val quoteColumnFamily = Bytes.toBytes("quote")
        val putRecord = new Put(Bytes.toBytes(quote.Id))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Id"), Bytes.toBytes(quote.Id))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Name"), Bytes.toBytes(quote.Name))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Opportunity__c"), Bytes.toBytes(quote.Opportunity__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Account__c"), Bytes.toBytes(quote.Account__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("SalesChannel__c"), Bytes.toBytes(quote.SalesChannel__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("PrimaryVenue__c"), Bytes.toBytes(quote.PrimaryVenue__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("SalesRegion__c"), Bytes.toBytes(quote.SalesRegion__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("SalePriceGross__c"), Bytes.toBytes(quote.SalePriceGross__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("CostPriceGross__c"), Bytes.toBytes(quote.CostPriceGross__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("GrossProfit__c"), Bytes.toBytes(quote.GrossProfit__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("GrossProfitPercent__c"), Bytes.toBytes(quote.GrossProfitPercent__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("NumberOfGuest__c"), Bytes.toBytes(quote.NumberOfGuest__c))
        putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Gross_profit_per_person__c"), Bytes.toBytes(quote.Gross_profit_per_person__c))

        putRecord
      }
      )
      ).saveAsNewAPIHadoopDataset(jobConf)
      rdd
    }).print()
    println("**************************************************************")
    ssc.start()
    println("start Streaming")
    ssc.awaitTermination()
  }
}

// scalastyle:on println

package com.golfbreaks.quote

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.Row

/**
  * Created by rapolu on 07/03/2017.
  */
case class Quote(Id: String,
                 Name: String = "",
                 Opportunity__c: String = "",
                 Account__c: String = "",
                 SalesChannel__c: String = "",
                 PrimaryVenue__c: String = "",
                 SalesRegion__c: String = "",
                 SalePriceGross__c: String = "0.0",
                 CostPriceGross__c: String = "0.0",
                 GrossProfit__c: String = "0.0",
                 GrossProfitPercent__c: String = "0.0",
                 NumberOfGuest__c: String = "0",
                 Gross_profit_per_person__c: String = "0.0") extends Serializable



object Quote extends Serializable{
  def parseQuote(str: String):Unit = {
    def apply(r: Row): Quote =
      Quote(r.getString(0),r.getString(1),r.getString(2),r.getString(3),r.getString(4),r.getString(5),r.getString(6),
        r.getString(7),r.getString(8),r.getString(9),r.getString(10), r.getString(11),r.getString(12)
        //,        r.getString(13),r.getString(13)
      )
  }

  def convertToPut(quote: Quote): (ImmutableBytesWritable,Put) = {
    val quoteColumnFamily = Bytes.toBytes("quote")
    val putRecord = new Put(Bytes.toBytes(quote.Id))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Id"), Bytes.toBytes(quote.Id))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Name"), Bytes.toBytes(quote.Name))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Opportunity__c"), Bytes.toBytes(quote.Opportunity__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Account__c"), Bytes.toBytes(quote.Account__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("SalesChannel__c"), Bytes.toBytes(quote.SalesChannel__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("PrimaryVenue__c"), Bytes.toBytes(quote.PrimaryVenue__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("SalesRegion__c"), Bytes.toBytes(quote.SalesRegion__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("SalePriceGross__c"), Bytes.toBytes(quote.SalePriceGross__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("CostPriceGross__c"), Bytes.toBytes(quote.CostPriceGross__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("GrossProfit__c"), Bytes.toBytes(quote.GrossProfit__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("GrossProfitPercent__c"), Bytes.toBytes(quote.GrossProfitPercent__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("NumberOfGuest__c"), Bytes.toBytes(quote.NumberOfGuest__c))
    putRecord.addColumn(quoteColumnFamily, Bytes.toBytes("Gross_profit_per_person__c"), Bytes.toBytes(quote.Gross_profit_per_person__c))
    //addPutToList(putRecord)
    (new ImmutableBytesWritable(Bytes.toBytes(quote.Id)),putRecord)
  }
  //,  @JsonProperty("CreatedDate") CreateDate: String,  @JsonProperty("LastModifiedDate") LastModifiedDate: String) extends Quote
}

Error:

Exception in thread "main" org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2067)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$map$1.apply(DStream.scala:558)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$map$1.apply(DStream.scala:558)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.SparkContext.withScope(SparkContext.scala:726)
    at org.apache.spark.streaming.StreamingContext.withScope(StreamingContext.scala:260)
    at org.apache.spark.streaming.dstream.DStream.map(DStream.scala:557)
    at com.golfbreaks.spark.streaming.Test$.main(Test.scala:48)
    at com.golfbreaks.spark.streaming.Test.main(Test.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)
Caused by: java.io.NotSerializableException: org.json4s.DefaultFormats$$anon$4
Serialization stack:
    - object not serializable (class: org.json4s.DefaultFormats$$anon$4, value: org.json4s.DefaultFormats$$anon$4@3b954661)
    - field (class: com.golfbreaks.spark.streaming.Test$$anon$1, name: dateFormat, type: interface org.json4s.DateFormat)
    - object (class com.golfbreaks.spark.streaming.Test$$anon$1, com.golfbreaks.spark.streaming.Test$$anon$1@3e8de7fd)
    - field (class: com.golfbreaks.spark.streaming.Test$$anonfun$3, name: formats$1, type: interface org.json4s.DefaultFormats)
    - object (class com.golfbreaks.spark.streaming.Test$$anonfun$3, <function1>)
    at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
    at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
    at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
    ... 17 more

1 Answer:

Answer 0 (score: 0)

Following the serialization stack, it seems the problem is that formats (specifically its dateFormat, per the stack trace) cannot be serialized. You could pass it into the closure explicitly, or define it as an implicit val inside the closure, rather than defining it implicitly at the start of main.
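For the second option, a minimal sketch of the map step could look like the following; it reuses the imports and variable names from the question's code and assumes DefaultFormats is sufficient to extract Quote:

    // Build the Formats inside the closure: the value is created where the task
    // runs, so nothing from the driver scope has to be serialized with the closure.
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap, StorageLevel.MEMORY_ONLY)
      .filter(_._2 != null)
      .map { case (_, json) =>
        implicit val formats: Formats = DefaultFormats // defined inside the closure, not in main()
        JsonParser.parse(json).extract[Quote]
      }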

I am not sure which method requests the implicit parameter; I will assume it is extract[T]. If that is the case, you can try something like the following:
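A minimal sketch of that idea, assuming the extract[A](implicit formats: Formats, mf: Manifest[A]) signature from json4s, is to supply the arguments explicitly inside the map:

    // Supplying the Formats and the Manifest explicitly means the closure no
    // longer captures the implicit val that was defined in main().
    case (_, json) => JsonParser.parse(json).extract[Quote](DefaultFormats, manifest[Quote])

Either way the goal is the same: keep the non-serializable Formats instance out of the state that Spark has to ship to the executors.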