I have developed a Spark Streaming application that reads several file streams and then unions them. The application uses checkpointing, and I restart it from the checkpoint data; after 4 or 5 restarts from the checkpoint it throws an exception. My code is as follows:
def creatingFunc(): StreamingContext = {
  //System.setProperty("hadoop.home.dir", "C:\\hadoop")
  val conf = new SparkConf()
    .setAppName("FileStreaming")
    .set("spark.streaming.fileStream.minRememberDuration", "2000000h")
    .set("spark.executor.instances", "2") /*.set("SPARK_CONF_DIR","src/main/resources")*/
    .registerKryoClasses(Array(classOf[org.apache.hadoop.io.LongWritable])) //.setMaster("local[7]")

  val sc = new SparkContext(conf)

  // Verify that the attached Spark cluster is 1.6.0+
  require(sc.version.replace(".", "").substring(0, 3).toInt >= 160,
    "Spark 1.6.0+ is required to run this code. Please attach it to a Spark 1.6.0+ cluster.")

  // Create a StreamingContext
  val ssc = new StreamingContext(sc, Seconds(batchIntervalSeconds))
  ssc.checkpoint("/mapr/cellos-mapr/user/mbazarganigilani/SparkStreaming1/src/main/checkpoints")

  val funcGSSNFilterHeader = (x: String) => !x.contains("servedMSISDN")
  val funcCCNFilterHeader = (x: String) => !x.contains("resultCode")

  val unionProbeStreams = (1 to 2).map {
    case 1 =>
      val ggsnArray = ssc.fileStream[LongWritable, Text, TextInputFormat](
          "/mapr/cellos-mapr/user/mbazarganigilani/SparkStreaming1/src/main/GGSN", filterF, false)
        .map(x => x._2.toString())
        .filter(funcGSSNFilterHeader)
        .map(_.split(","))
        .map(x => x.length match {
          case 25 => new SIMPLE_GGSN(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11), x(12), x(13), x(14),
            new SIMPLE_GGSN_2(x(15), x(16), x(17), x(18), x(19), x(20), x(21), x(22), x(23), x(24)))
          case _ => new SIMPLE_GGSN("Invalid GGSN CDR", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
            new SIMPLE_GGSN_2("", "", "", "", "", "", "", "", "", ""))
        })
        .map(x => (new SIMPLE_KEY_JOINS(x.IMSI, x.CAHRGING_ID), x))
        .map(x => (x._1, x._2.IMSI))
      ggsnArray
    case 2 =>
      val ccnArray = ssc.fileStream[LongWritable, Text, TextInputFormat](
          "/mapr/cellos-mapr/user/mbazarganigilani/SparkStreaming1/src/main/CCN", filterF, false)
        .map(x => x._2.toString())
        .filter(funcCCNFilterHeader)
        .map(_.split(","))
        .map(x => x.length match {
          case 43 => new SIMPLE_CCN(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10), x(11), x(12), x(13), x(14), x(15), x(16), x(17), x(18), x(19), x(20),
            new SIMPLE_CCN_2(x(21), x(22), x(23), x(24), x(25), x(26), x(27), x(28), x(29), x(30), x(31), x(32), x(33), x(34), x(35), x(36), x(37), x(38), x(39), x(40), x(41), x(42)))
          case _ => new SIMPLE_CCN("Invalid CCN CDR", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
            new SIMPLE_CCN_2("", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""))
        })
        .map(x => (new SIMPLE_KEY_JOINS(x.IMSI, x.ccn_2.CHARGINGCONTEXT_16778226), x))
        .map(x => (x._1, x._2.IMSI))
      ccnArray
  }

  val joined = ssc.union(unionProbeStreams)
  joined.checkpoint(Duration(batchIntervalSeconds * 1000 * 5))
  //joined.foreachRDD(_.count())
  joined.foreachRDD(y => {
    println("this count for joint is " + y.count())
    //y.foreach(x => println(x))
  })

  ssc.remember(Minutes(1)) // To make sure data is not deleted by the time we query it interactively
  println("Creating function called to create new StreamingContext")
  newContextCreated = true
  ssc
}
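For reference, the StreamingContext is obtained through the usual checkpoint-recovery pattern, roughly like the sketch below (this is a simplified sketch, not my exact main; ssc.start()/awaitTermination() stand in for the rest of the driver code):

// Simplified sketch of the driver wiring: recover the StreamingContext from
// the checkpoint directory if one exists, otherwise build a fresh one via creatingFunc.
val checkpointDir = "/mapr/cellos-mapr/user/mbazarganigilani/SparkStreaming1/src/main/checkpoints"
val ssc = StreamingContext.getOrCreate(checkpointDir, creatingFunc _)
ssc.start()
ssc.awaitTermination()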
When I run the application after it has been restarted from the checkpoint data a few times, I get the following exception:
java.lang.IllegalStateException: SparkContext has been shutdown
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD.count(RDD.scala:1157)
at UnionStream$$anonfun$creatingFunc$5.apply(UnionStreaming.scala:453)
at UnionStream$$anonfun$creatingFunc$5.apply(UnionStreaming.scala:451)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
2016-09-27 12:58:15,025 ERROR [JobScheduler] scheduler.JobScheduler: Error running job streaming job 1474943750000 ms.4
I am using Spark Streaming 1.6.1.