我有一个简单的 Spark Streaming 作业。它从一个 Kafka 主题中读取事件,进行简单的事件转换(例如,将某些字符替换为其他字符),然后将转换后的事件发送到第二个 Kafka 主题。一切运行良好一段时间(1 - 1.5 小时),之后我们看到批次开始排队等待运行(见下方截图)。暂停大约持续 5-6 分钟,期间 GC 正在运行并清理内存。之后一切恢复正常,但有时处理会完全停止,日志中会出现类似下面的错误(请参阅下面的堆栈跟踪)。请告知应该设置哪些 Spark / Java 参数以避免此 GC 开销。Spark 任务每 10 秒调度一次,一次执行大约需要 5 秒。
堆栈跟踪
2017-09-21 11:26:15 WARN TaskSetManager:66 - Lost task 33.0 in stage 115.0 (TID 4699, work8, executor 6): java.lang.OutOfMemoryError: GC overhead limit exceeded
at org.apache.kafka.clients.consumer.internals.Fetcher.createFetchRequests(Fetcher.java:724)
at org.apache.kafka.clients.consumer.internals.Fetcher.sendFetches(Fetcher.java:176)
at org.apache.kafka.clients.consumer.KafkaConsumer.pollOnce(KafkaConsumer.java:1042)
at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:995)
at org.apache.spark.streaming.kafka010.CachedKafkaConsumer.poll(CachedKafkaConsumer.scala:99)
at org.apache.spark.streaming.kafka010.CachedKafkaConsumer.get(CachedKafkaConsumer.scala:70)
at org.apache.spark.streaming.kafka010.KafkaRDD$KafkaRDDIterator.next(KafkaRDD.scala:228)
at org.apache.spark.streaming.kafka010.KafkaRDD$KafkaRDDIterator.next(KafkaRDD.scala:194)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1951)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1951)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
2017-09-21 11:26:15 INFO TaskSetManager:54 - Lost task 37.0 in stage 115.0 (TID 4702) on work8, executor 6: java.lang.OutOfMemoryError (GC overhead limit exceeded) [duplicate 1]
2017-09-21 11:26:15 INFO TaskSetManager:54 - Lost task 26.0 in stage 115.0 (TID 4695) on work8, executor 6: java.lang.OutOfMemoryError (GC overhead limit exceeded) [duplicate 2]
spark-submit 参数
# Launches the streaming job on YARN in client mode: 4 executors x 8 cores, 20g each.
# NOTE(review): spark.streaming.receiver.maxRate only applies to receiver-based
# streams; this job uses createDirectStream (direct approach), so only
# spark.streaming.kafka.maxRatePerPartition takes effect — confirm intent.
# NOTE(review): the command appears truncated here (application jar not shown).
spark-2.1.1-bin-hadoop2.6/bin/spark-submit \
--master yarn \
--deploy-mode client \
--executor-cores 8 \
--executor-memory 20g \
--driver-memory 20g \
--num-executors 4 \
--conf "spark.driver.maxResultSize=8g" \
--conf "spark.streaming.receiver.maxRate=1125" \
--conf "spark.streaming.kafka.maxRatePerPartition=1125" \
//Job
// Partially apply the sender once; each batch only needs to supply the RDD.
val sendToKafka = KafkaSender.sendToKafka(spark, kafkaServers, outputTopic, kafkaEnabled) _
val stream = KafkaUtils
  .createDirectStream(ssc, PreferConsistent, Subscribe[String, String](inputTopics, kafkaParams))
stream.foreachRDD { statementsStreamBatch =>
  // Capture offsets before any transformation so they can be committed after processing.
  val offsetRanges = statementsStreamBatch.asInstanceOf[HasOffsetRanges].offsetRanges
  if (!statementsStreamBatch.isEmpty) {
    val inputCsvRDD = statementsStreamBatch.map(_.value)
    // `if` is an expression: no null-initialized var needed.
    // Fixed: original was missing the '(' after processStream.
    val outputCsvRDD: RDD[String] =
      if (enrichmerEnabled) Enricher.processStream(inputCsvRDD, enricherNumberOfFields)
      else inputCsvRDD
    sendToKafka(outputCsvRDD)
  }
  // Commit consumed offsets back to Kafka (at-least-once delivery).
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
ssc.start()
ssc.awaitTermination()
//Enricher
object Enricher {

  /**
   * Masks commas embedded inside quoted CSV fields.
   *
   * Each line whose comma count exceeds `numberOfFields` has commas matching
   * the pattern `,(?=[^']*',)` replaced with `#`; lines with at most
   * `numberOfFields` commas pass through unchanged.
   *
   * @param eventStream    RDD of raw CSV lines
   * @param numberOfFields expected number of field separators per line
   *                       (changed from boxed java.lang.Integer to Int —
   *                       source-compatible and avoids boxing in the comparison)
   * @return RDD with the same lines, commas inside quoted fields replaced by '#'
   */
  def processStream(eventStream: RDD[String], numberOfFields: Int): RDD[String] = {
    eventStream.map(
      csv => if (csv.count(_ == ',') <= numberOfFields) {
        csv
      } else {
        csv.replaceAll(",(?=[^']*',)", "#")
      }
    )
  }
} // Fixed: the object's closing brace was missing in the original.
//KafkaSender
object KafkaSender {

  /**
   * Sends every record of `message` to the given Kafka `topic`.
   *
   * @param spark   active session, used only to reach the SparkContext for broadcasting
   * @param servers Kafka bootstrap servers string
   * @param topic   destination topic
   * @param enabled when false, nothing is sent (fixed: this flag was previously ignored)
   * @param message RDD of serialized records to publish
   */
  def sendToKafka(spark: SparkSession, servers: String, topic: String, enabled: Boolean)(message: RDD[String]): Unit = {
    if (enabled) {
      // NOTE(review): broadcasting a new KafkaSink on EVERY invocation (i.e. every
      // micro-batch) allocates a fresh sink/producer per batch and is a likely
      // contributor to the reported GC-overhead errors. Prefer creating this
      // broadcast once, outside foreachRDD, with KafkaSink holding a lazily
      // initialized producer — confirm KafkaSink's construction cost.
      val kafkaSink = spark.sparkContext.broadcast(KafkaSink(getKafkaProperties(servers)))
      // The topic is a plain String: serializable as-is, no broadcast needed.
      message.foreach(kafkaSink.value.send(topic, _))
    }
  }
}