I am fairly new to Spark and big data processing. I am writing a Spark job in which I consume a stream from Kafka and perform multiple aggregations on that stream; sometimes I also have to create windowed streams from the Kafka direct stream.
At the moment I only have two aggregations to perform, but in the future I may have to run multiple aggregations over windows of several different time intervals.
The batch interval is 60 seconds.
Here is my code.
val streamingContext = new StreamingContext(sc, Seconds(60))
val rawStream = KafkaUtils.createDirectStream[String, String](
  streamingContext,
  PreferConsistent,
  Subscribe[String, String](topics, kafkaParams))

// transform creates a new RDD, so I thought that if I create a new RDD then each of the
// different aggregations will have its own RDD to perform actions on
val rViewStream = rawStream.transform(rdd => rdd)
val rawStreamValues = rViewStream.map(x => x.value())

// 10-minute window, sliding every 60 seconds
val windowStream = rawStreamValues.window(Minutes(10), Seconds(60))

windowStream.foreachRDD { windowRDD =>
  val windowDf = sqlContext.read.json(windowRDD)
  // perform some aggregation on this data frame and push results to redis
}

rawStream.foreachRDD { rdd =>
  // perform some aggregation/transformation on this stream and save to hdfs
  val stringRDD = rdd.map(cr => cr.value())
  val rawDf = sqlContext.read.json(stringRDD) // <---- exception here
}
Full exception stack trace:
Caused by: java.lang.InterruptedException
at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedInterruptibly(AbstractQueuedSynchronizer.java:998)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(AbstractQueuedSynchronizer.java:1304)
at scala.concurrent.impl.Promise$DefaultPromise.tryAwait(Promise.scala:202)
at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:218)
at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:153)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1988)
at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1089)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.fold(RDD.scala:1083)
at org.apache.spark.sql.execution.datasources.json.InferSchema$.infer(InferSchema.scala:69)
at org.apache.spark.sql.DataFrameReader$$anonfun$3.apply(DataFrameReader.scala:329)
at org.apache.spark.sql.DataFrameReader$$anonfun$3.apply(DataFrameReader.scala:329)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:328)
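Side note on something I have considered: the trace goes through InferSchema.infer, so read.json apparently runs a separate Spark job on every batch just to infer the schema. Below is a minimal sketch of what I mean by reading with an explicit schema instead (the field names here are made up; my real payload is different), though I am not sure whether this addresses the interruption or just avoids the inference job:

import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

// Hypothetical schema -- the real field names and types depend on the actual JSON payload
val jsonSchema = StructType(Seq(
  StructField("eventType", StringType, nullable = true),
  StructField("timestamp", LongType, nullable = true)))

rawStream.foreachRDD { rdd =>
  val stringRDD = rdd.map(cr => cr.value())
  // with an explicit schema, read.json should not need to run the inference job from the trace
  val rawDf = sqlContext.read.schema(jsonSchema).json(stringRDD)
  // same aggregation / save to hdfs as before
}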