使用 Spark 应用消费数据时,Kafka 的消费滞后(Lag)并未减少

时间:2019-01-21 12:29:45

标签: apache-spark apache-kafka kafka-consumer-api spark-cassandra-connector

我正在尝试使用单节点 Spark 和单节点 Cassandra,将数据从 Kafka 写入 Cassandra。我的代码片段如下:

import com.datastax.spark.connector.writer._ 

import java.util.Calendar
import scala.collection.mutable.ListBuffer
import org.apache.commons.codec.digest.DigestUtils
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.KafkaUtils
import com.datastax.spark.connector.SomeColumns
import com.datastax.spark.connector.toRDDFunctions
import com.datastax.spark.connector.streaming._
import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.sql._
import kafka.serializer.StringDecoder
import net.liftweb.json.DefaultFormats
import net.liftweb.json.parse
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.ForeachWriter
import com.datastax.driver.core.Cluster
import axestrack.bean.{ event, liveevent }
import java.time.format.{ DateTimeFormatter, DateTimeParseException }
import java.time.{ Instant, LocalDate, ZoneId, ZonedDateTime }

/**
 * Spark Streaming job that reads JSON messages from a Kafka topic, converts each
 * one into an `event` row and writes the rows to two Cassandra tables.
 *
 * Command-line arguments (all required):
 *   - `args(0)`: Kafka topic to consume
 *   - `args(1)`: value for the Kafka `zookeeper.connect` setting
 *   - `args(2)`: value for the Kafka `group.id` setting
 */
object liveData extends Serializable {

  // Zone used for every formatted timestamp; resolved once instead of on each call.
  private val utc: ZoneId = ZoneId.of("UTC")

  // DateTimeFormatter is immutable, so these are plain vals.
  private val dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd")
  private val dateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")

  implicit val formats: DefaultFormats.type = DefaultFormats

  /** Formats an epoch-millisecond timestamp in UTC with the given formatter. */
  private def formatUtc(formatter: DateTimeFormatter, epochMillis: Long): String =
    formatter.format(Instant.ofEpochMilli(epochMillis).atZone(utc))

  /**
   * Parses one JSON message into a `liveevent` and maps it onto an `event`.
   *
   * Note that the result is not a pure function of the input: one of the fields
   * is the wall-clock time at which this method runs.
   *
   * @param json raw JSON payload of a single Kafka message
   * @return the `event` built from the extracted `liveevent`
   */
  def parser(json: String): event = {
    val fleetrecord = parse(json).extract[liveevent]

    event(
      fleetrecord.id,
      formatUtc(dateFormatter, fleetrecord.evT),
      formatUtc(dateTimeFormatter, fleetrecord.evT),
      // Processing time (now), not a value taken from the message.
      formatUtc(dateTimeFormatter, System.currentTimeMillis()),
      DigestUtils.md5Hex(fleetrecord.id + fleetrecord.evT).toUpperCase(),
      formatUtc(dateTimeFormatter, fleetrecord.lstevT),
      // Both coordinates are rounded to 6 decimal places.
      BigDecimal(fleetrecord.lT).setScale(6, BigDecimal.RoundingMode.HALF_UP).toDouble,
      BigDecimal(fleetrecord.lN).setScale(6, BigDecimal.RoundingMode.HALF_UP).toDouble,
      fleetrecord.d, fleetrecord.dFrmD, fleetrecord.s, fleetrecord.agl, 0, fleetrecord.port,
      Map("d1" -> fleetrecord.d1, "d2" -> fleetrecord.d2, "d3" -> fleetrecord.d3, "d4" -> fleetrecord.d4,
        "ebt" -> fleetrecord.eBt, "ibt" -> fleetrecord.iBt, "a1" -> fleetrecord.a1, "a2" -> fleetrecord.a2,
        "a3" -> fleetrecord.a3, "a4" -> fleetrecord.a4),
      fleetrecord.evT)
  }

  /**
   * Entry point: configures Spark, opens the Kafka stream and starts the
   * streaming context with a 10 second batch interval.
   *
   * @param args topic, ZooKeeper connect string and consumer group id (see object doc)
   * @throws IllegalArgumentException if fewer than three arguments are supplied
   */
  def main(args: Array[String]): Unit = {
    // Fail with a usage message rather than an ArrayIndexOutOfBoundsException.
    require(args.length >= 3, "Usage: liveData <topic> <zookeeper.connect> <group.id>")
    val topic = args(0)
    val zookeeperConnect = args(1)
    val groupId = args(2)

    val conf = new SparkConf()
      //.setMaster("local[3]")
      .setAppName("Fleet Live Data")
      .set("spark.cassandra.connection.host", "ip")
      .set("spark.cassandra.connection.keep_alive_ms", "60000")
      .set("spark.cassandra.auth.username", "user")
      .set("spark.cassandra.auth.password", "pass")
      .set("spark.executor.memory", "2g")
      .set("spark.driver.memory", "2g")
      .set("spark.submit.deployMode", "cluster")
      .set("spark.executor.instances", "4")
      .set("spark.executor.cores", "1")
      .set("spark.cores.max", "9")
      .set("spark.driver.cores", "9")
      //   .set("spark.cassandra.input.split.size_in_mb", "67108864")
      //  .set("spark.streaming.backpressure.enabled", "true")
      .set("spark.speculation", "true")
      .set("spark.locality.wait", "2s")

    println("Spark Configuration Done")
    val spark = SparkSession
      .builder
      .appName("Fleet Live Data")
      .config(conf)
      .getOrCreate()
    println("Spark Session Config Done")

    // Reuse the context owned by the session instead of looking it up again.
    val sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    val ssc = new StreamingContext(sc, Seconds(10))
    // The map value is the number of consumer threads for the topic in the receiver.
    val topics = Map(topic -> 1)
    val kafkaParams = Map[String, String](
      "zookeeper.connect" -> zookeeperConnect,
      "group.id" -> groupId,
      "auto.offset.reset" -> "largest")

    val kafkaStream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics, StorageLevel.MEMORY_AND_DISK_SER)

    // The parsed stream feeds two output operations. Without persisting it, every
    // batch is parsed twice, and because parser() reads the wall clock the two
    // tables could receive different processing timestamps for the same record.
    val collection = kafkaStream.map(_._2).map(parser).cache()

    collection.saveToCassandra("trackfleet_db", "locationinfo")
    collection.saveToCassandra("trackfleet_db", "locationinfo_recent", writeConf = WriteConf(timestamp = TimestampOption.perRow("gpsdtt")))
    // Printed once, when the streaming job is about to start (not once per batch).
    println("Batch Started at " + formatUtc(dateTimeFormatter, System.currentTimeMillis()))
    ssc.start()
    ssc.awaitTermination()
  }
}

数据确实在被消费:这段代码以 10 秒为一个批次运行,每 10 秒大约消费 3K 条记录。但是 Kafka 队列中的滞后(lag)却在持续增加。可能是什么问题?这非常奇怪。需要一些帮助,谢谢。

0 个答案:

没有答案