Checkpointing several file streams in my Spark Streaming context

Asked: 2016-08-19 12:33:12

Tags: spark-streaming checkpointing

I have written a Spark Streaming application which needs checkpointing on several DStreams with underlying transformations. As suggested in this post (Error in starting Spark streaming context), I have done all the transformations inside the function defined to create the context:

object StreamingEngine2 {

  // Accept every file in the monitored directories
  val filterF = { (x: Path) => true }

  // === Configuration to control the flow of the application ===
  val stopActiveContext = true

  val batchIntervalSeconds = 10
  val eventsPerSecond = 10 // For the dummy source



  var newContextCreated = false // Flag to detect whether new context was created or not

  // Function to create a new StreamingContext and set it up
  val creatingFunc = { () =>

    //add winutil to avoid: ERROR org.apache.spark.streaming.scheduler.JobScheduler - Error generating jobs for time 1471237080000 ms java.lang.NullPointerException
    System.setProperty("hadoop.home.dir", "C:\\hadoop")
    println("Creating function called to create new StreamingContext")

    val conf = new SparkConf()
      .setMaster("local[10]")
      .setAppName("FileStreaming")
      // remember files far into the past so pre-existing files are still picked up
      .set("spark.streaming.fileStream.minRememberDuration", "2000000h")
      /*.set("SPARK_CONF_DIR","src/main/resources")*/
      .registerKryoClasses(Array(
        classOf[org.apache.hadoop.io.LongWritable],
        classOf[org.apache.hadoop.io.Text],
        classOf[GGSN]))


    // Verify that the attached Spark cluster is 1.6.0+
    val sc = new SparkContext(conf)
    require(sc.version.replace(".", "").substring(0, 3).toInt >= 160,
      "Spark 1.6.0+ is required to run this code. Please attach it to a Spark 1.6.0+ cluster.")



    val ssc = new StreamingContext(sc, Seconds(batchIntervalSeconds))
    ssc.checkpoint("c:\\checkpoints")
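    // This must be the same directory that is passed to StreamingContext.getOrCreate
    // in main(), so the context can be recovered from the checkpoint data.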


    // fileStream[K, V, F](directory, filter, newFilesOnly = false):
    // newFilesOnly = false also processes files already present in the directory
    val ggsnFileLines = ssc.fileStream[LongWritable, Text, TextInputFormat](
      "C:\\Users\\Mbazarganigilani\\Documents\\RA\\GGSN\\Files", filterF, false)
    val ccnFileLines = ssc.fileStream[LongWritable, Text, TextInputFormat](
      "C:\\Users\\Mbazarganigilani\\Documents\\RA\\CCN\\Files1", filterF, false)



    // some mapping and transformation (elided); probeFileLines is derived here

    // Checkpoint each stream every 5 batch intervals (10 s * 5 = 50 s)
    probeFileLines.checkpoint(Duration(batchIntervalSeconds * 1000 * 5))
    ggsnFileLines.checkpoint(Duration(batchIntervalSeconds * 1000 * 5))


    // check GGSN...
    probeFileLines.foreachRDD { rdd =>
      println(rdd.count())
    }


    ssc.remember(Minutes(1)) // To make sure data is not deleted by the time we query it interactively



    newContextCreated = true
    ssc
  }

  def main(args: Array[String]): Unit = {



    System.setProperty("hadoop.home.dir", "C:\\hadoop")

    if (stopActiveContext) {
      StreamingContext.getActive.foreach {
        _.stop(stopSparkContext = false)
      }
    }

    // Recover the context from the checkpoint directory if present, otherwise build it with creatingFunc
    val ssc = StreamingContext.getOrCreate("c:\\checkpoints", creatingFunc)


    if (newContextCreated) {
      println("New context created from currently defined creating function")
    } else {
      println("Existing context running or recovered from checkpoint, may not be running currently defined creating function")
    }

    // Start the streaming context in the background.
    ssc.start()

    // Block the main thread until termination or timeout. awaitTerminationOrTimeout takes
    // milliseconds, so this keeps the application alive for roughly 5.5 hours.
    ssc.awaitTerminationOrTimeout(batchIntervalSeconds * 2 * 1000 * 1000)


  }


}

However, when I load my context from the checkpoint directory, I still get the following exception:
16/08/19 22:28:01 ERROR Utils: Exception encountered
java.lang.NullPointerException
    at org.apache.spark.streaming.dstream.DStreamCheckpointData$$anonfun$writeObject$1.apply$mcV$sp(DStreamCheckpointData.scala:126)
    at org.apache.spark.streaming.dstream.DStreamCheckpointData$$anonfun$writeObject$1.apply(DStreamCheckpointData.scala:124)
    at org.apache.spark.streaming.dstream.DStreamCheckpointData$$anonfun$writeObject$1.apply(DStreamCheckpointData.scala:124)
    at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1253)
    at org.apache.spark.streaming.dstream.DStreamCheckpointData.writeObject(DStreamCheckpointData.scala:124)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at java.io.ObjectStreamClass.invokeWriteObject(ObjectStreamClass.java:988)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1496)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.defaultWriteObject(ObjectOutputStream.java:441)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$writeObject$1.apply$mcV$sp(DStream.scala:516)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$writeObject$1.apply(DStream.scala:511)
    at org.apache.spark.streaming.dstream.DStream$$anonfun$writeObject$1.apply(DStream.scala:511)
    at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1253)
    at org.apache.spark.streaming.dstream.DStream.writeObject(DStream.scala:511)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at java.io.ObjectStreamClass.invokeWriteObject(ObjectStreamClass.java:988)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1496)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.writeArray(ObjectOutputStream.java:1378)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1174)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.defaultWriteObject(ObjectOutputStream.java:441)
    at org.apache.spark.streaming.DStreamGraph$$anonfun$writeObject$1.apply$mcV$sp(DStreamGraph.scala:182)
    at org.apache.spark.streaming.DStreamGraph$$anonfun$writeObject$1.apply(DStreamGraph.scala:177)
    at org.apache.spark.streaming.DStreamGraph$$anonfun$writeObject$1.apply(DStreamGraph.scala:177)
    at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1253)
    at org.apache.spark.streaming.DStreamGraph.writeObject(DStreamGraph.scala:177)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at java.io.ObjectStreamClass.invokeWriteObject(ObjectStreamClass.java:988)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1496)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1548)
    at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1509)
    at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
    at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
    at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
    at org.apache.spark.streaming.Checkpoint$$anonfun$serialize$1.apply$mcV$sp(Checkpoint.scala:143)
    at org.apache.spark.streaming.Checkpoint$$anonfun$serialize$1.apply(Checkpoint.scala:143)
    at org.apache.spark.streaming.Checkpoint$$anonfun$serialize$1.apply(Checkpoint.scala:143)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1287)
    at org.apache.spark.streaming.Checkpoint$.serialize(Checkpoint.scala:144)
    at org.apache.spark.streaming.StreamingContext.validate(StreamingContext.scala:525)
    at org.apache.spark.streaming.StreamingContext.liftedTree1$1(StreamingContext.scala:573)
    at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:572)
    at StreamingEngine2$.main(StreamEngine2.scala:694)
    at StreamingEngine2.main(StreamEngine2.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)

Can anyone help me find the problem in my code? I have tried it on both Spark 1.6 and 2.0 and get the same exception.

1 Answer:

Answer 0 (score: 1)

OK, the problem was due to having several streams, and I should call count on each of them every time... but I was not sure where I should call count and where I should put the checkpoint for each stream.
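A minimal sketch of this fix, under the assumption that ggsnFileLines is one input stream from the question and that probeFileLines stands for the DStream produced by the elided transformations (presumably derived from ccnFileLines): inside creatingFunc, checkpoint each stream right after it is derived, and give every checkpointed stream its own output action so that it is materialized each batch.

    // Inside creatingFunc, after the streams have been derived:
    val checkpointInterval = Duration(batchIntervalSeconds * 1000 * 5)

    // 1. Checkpoint each stream that must survive recovery.
    ggsnFileLines.checkpoint(checkpointInterval)
    probeFileLines.checkpoint(checkpointInterval)

    // 2. Register an output action on every checkpointed stream, so each one
    //    is part of the job graph when the DStreamGraph is serialized.
    ggsnFileLines.foreachRDD(rdd => println(s"GGSN count: ${rdd.count()}"))
    probeFileLines.foreachRDD(rdd => println(s"probe count: ${rdd.count()}"))

With every checkpointed stream tied to an output operation, no stream sits unreferenced in the graph when the checkpoint is written, which appears to be what triggered the NullPointerException above.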