I'm working on some code with Kafka and Spark Streaming, and when I run it on yarn-cluster it throws a NullPointerException.
But on my own machine (standalone mode) it runs fine.
So what is wrong with it?

// Here is the code
import java.util.Properties
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}
object DealLog extends App {

  val spark = SparkSession.builder().appName(" DealLog").getOrCreate()
  val sc = spark.sparkContext
  val ssc: StreamingContext = new StreamingContext(sc, Seconds(3))
  val log = Logger.getLogger(this.getClass)

  val pro = new Properties()
  val in = Thread.currentThread().getContextClassLoader.getResourceAsStream("config.properties")
  pro.load(in)
  // ssc.checkpoint("hdfs://192.168.0.240:8022/bigdata/checkpoint2")
  val bootstrap = pro.getProperty("kafka.brokers")

  val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> bootstrap,
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "userlabel",
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (true: java.lang.Boolean)
  )
  val topicsSet = Array(pro.getProperty("kafkaconsume.topic"))

  val ds = KafkaUtils.createDirectStream[String, String](
    ssc,
    PreferConsistent,
    Subscribe[String, String](topicsSet, kafkaParams)
  ).map(s => s.value())

  ds.foreachRDD(p => {
    log.info("ds.foreachRdd p==" + p)
    p.foreachPartition(per => {
      log.info("per-------" + per)
      per.foreach(rdd => {
        log.info("rdd---------" + rdd)
        if (rdd.isEmpty) {
          log.info("null ")
        } else {
          log.info("not null..")
        }
        log.info("complete")
      })
    })
  })

  ssc.start()
  ssc.awaitTermination()
}
------------------------ Exception here ------------------------
19/07/26 18:21:56 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, cdh102, executor 2): java.lang.NullPointerException
    at Recommend.DealLog$$anonfun$2$$anonfun$apply$1.apply(DealLog.scala:42)
    at Recommend.DealLog$$anonfun$2$$anonfun$apply$1.apply(DealLog.scala:41)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
19/07/26 18:21:56 INFO scheduler.TaskSetManager: Starting task 0.1 in stage 0.0 (TID 1, cdh102, executor 2, partition 0, PROCESS_LOCAL, 4706 bytes)
19/07/26 18:21:56 INFO scheduler.TaskSetManager: Lost task 0.1 in stage 0.0 (TID 1) on cdh102, executor 2: java.lang.NullPointerException (null) [duplicate 1]
19/07/26 18:21:56 INFO scheduler.TaskSetManager: Starting task 0.2 in stage 0.0 (TID 2, cdh102, executor 2, partition 0, PROCESS_LOCAL, 4706 bytes)
19/07/26 18:21:56 INFO scheduler.TaskSetManager: Lost task 0.2 in stage 0.0 (TID 2) on cdh102, executor 2: java.lang.NullPointerException (null) [duplicate 2]
19/07/26 18:21:56 INFO scheduler.TaskSetManager: Starting task 0.3 in stage 0.0 (TID 3, cdh102, executor 2, partition 0, PROCESS_LOCAL, 4706 bytes)
19/07/26 18:21:56 INFO scheduler.TaskSetManager: Lost task 0.3 in stage 0.0 (TID 3) on cdh102, executor 2: java.lang.NullPointerException (null) [duplicate 3]
19/07/26 18:21:56 ERROR scheduler.TaskSetManager: Task 0 in stage 0.0 failed 4 times; aborting job
19/07/26 18:21:56 INFO cluster.YarnClusterScheduler: Removed TaskSet 0.0, whose tasks have all completed, from pool
19/07/26 18:21:56 INFO cluster.YarnClusterScheduler: Cancelling stage 0
19/07/26 18:21:56 INFO scheduler.DAGScheduler: ResultStage 0 (foreachPartition at DealLog.scala:41) failed in 1.092 s due to Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, cdh102, executor 2): java.lang.NullPointerException
    at Recommend.DealLog$$anonfun$2$$anonfun$apply$1.apply(DealLog.scala:42)
    at Recommend.DealLog$$anonfun$2$$anonfun$apply$1.apply(DealLog.scala:41)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:926)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Answer 0 (score: 0)
I think your problem probably comes from this line:

if (rdd.isEmpty)

because, the way the code is written, that value is not actually an RDD. Once you call foreachPartition you get an iterator over that partition, and when you call foreach on that iterator you are accessing the actual records of that partition. So what you are handling there is a single record from the DStream, and you are most likely calling .isEmpty on a null string value, which is what throws the exception.
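To make the nesting explicit, here is a minimal sketch (reusing the ds stream from the code above, with clearer parameter names than p/per/rdd) of what each level hands you:

ds.foreachRDD { rdd =>             // rdd: an RDD[String] for one micro-batch
  rdd.foreachPartition { part =>   // part: an Iterator[String] running on an executor
    part.foreach { record =>       // record: a single Kafka value, a String
      // record.isEmpty throws a NullPointerException when the value is null
    }
  }
}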
You could replace .isEmpty with

if (record == null)

but you don't have to do that. You can just check whether the RDD itself is empty. Could you try the following?
import java.util.Properties
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DealLog extends App {

  val spark = SparkSession.builder().appName(" DealLog").getOrCreate()
  val sc = spark.sparkContext
  val ssc: StreamingContext = new StreamingContext(sc, Seconds(3))
  val log = Logger.getLogger(this.getClass)

  val pro = new Properties()
  val in = Thread.currentThread().getContextClassLoader.getResourceAsStream("config.properties")
  pro.load(in)
  // ssc.checkpoint("hdfs://192.168.0.240:8022/bigdata/checkpoint2")
  val bootstrap = pro.getProperty("kafka.brokers")

  val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> bootstrap,
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "userlabel",
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (true: java.lang.Boolean)
  )
  val topicsSet = Array(pro.getProperty("kafkaconsume.topic"))

  val ds = KafkaUtils.createDirectStream[String, String](
    ssc,
    PreferConsistent,
    Subscribe[String, String](topicsSet, kafkaParams)
  ).map(s => s.value())

  ds.foreachRDD(rdd => {
    log.info("ds.foreachRdd p==" + rdd)
    if (!rdd.isEmpty) {
      rdd.foreachPartition(partition => {
        log.info("per-------" + partition)
        partition.foreach(record => {
          log.info("record---------" + record)
        })
      })
    } else log.info("rdd was empty")
    log.info("complete")
  })

  ssc.start()
  ssc.awaitTermination()
  ssc.stop()
}
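As a follow-up: if null values can actually appear on your topic (Kafka tombstone records carry a null value), you could also guard each record before calling any String method on it. A minimal sketch of that inner loop, under the assumption that nulls are possible:

partition.foreach(record => {
  // record can be null for Kafka tombstone messages; guard before calling String methods
  if (record == null) log.warn("received a null record value")
  else log.info("record---------" + record)
})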