我正在尝试使用单节点 Spark 和单节点 Cassandra,将数据从 Kafka 写入 Cassandra。我的代码片段如下:
import com.datastax.spark.connector.writer._
import java.util.Calendar
import scala.collection.mutable.ListBuffer
import org.apache.commons.codec.digest.DigestUtils
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.KafkaUtils
import com.datastax.spark.connector.SomeColumns
import com.datastax.spark.connector.toRDDFunctions
import com.datastax.spark.connector.streaming._
import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.sql._
import kafka.serializer.StringDecoder
import net.liftweb.json.DefaultFormats
import net.liftweb.json.parse
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.ForeachWriter
import com.datastax.driver.core.Cluster
import axestrack.bean.{ event, liveevent }
import java.time.format.{ DateTimeFormatter, DateTimeParseException }
import java.time.{ Instant, LocalDate, ZoneId, ZonedDateTime }
/**
 * Spark Streaming job that reads JSON messages from Kafka, converts each one into
 * an `event` row and writes the rows to two Cassandra tables
 * (`trackfleet_db.locationinfo` and `trackfleet_db.locationinfo_recent`).
 *
 * Program arguments: `<topic> <zookeeper-connect> <consumer-group-id>`.
 */
object liveData extends Serializable {
  // Every timestamp produced by this job is rendered in UTC.
  private val utc = ZoneId.of("UTC")
  private val dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd")
  private val dateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")

  // Implicit lift-json formats required by `extract`.
  implicit val formats: DefaultFormats.type = DefaultFormats

  /** Formats an epoch-millisecond timestamp in UTC using the given formatter. */
  private def formatUtc(formatter: DateTimeFormatter, epochMillis: Long): String =
    formatter.format(Instant.ofEpochMilli(epochMillis).atZone(utc))

  /**
   * Parses one JSON message into an `event`.
   *
   * The message is first extracted into a `liveevent`; its epoch-millisecond
   * timestamps are rendered as UTC strings, latitude/longitude are rounded to six
   * decimal places (HALF_UP), and an upper-case MD5 hex digest of `id + evT` is
   * computed.
   *
   * NOTE(review): malformed JSON is not handled here; an exception thrown by
   * `parse`/`extract` would propagate into the Spark task — confirm this is intended.
   *
   * @param json raw JSON payload of a single Kafka message
   * @return the `event` built from the payload
   */
  def parser(json: String): event = {
    val fleetrecord = parse(json).extract[liveevent]
    event(
      fleetrecord.id,
      formatUtc(dateFormatter, fleetrecord.evT),
      formatUtc(dateTimeFormatter, fleetrecord.evT),
      // Wall-clock time at which this record was parsed.
      formatUtc(dateTimeFormatter, System.currentTimeMillis()),
      DigestUtils.md5Hex(fleetrecord.id + fleetrecord.evT).toUpperCase(),
      formatUtc(dateTimeFormatter, fleetrecord.lstevT),
      BigDecimal(fleetrecord.lT).setScale(6, BigDecimal.RoundingMode.HALF_UP).toDouble,
      BigDecimal(fleetrecord.lN).setScale(6, BigDecimal.RoundingMode.HALF_UP).toDouble,
      fleetrecord.d, fleetrecord.dFrmD, fleetrecord.s, fleetrecord.agl, 0, fleetrecord.port,
      Map(
        "d1" -> fleetrecord.d1, "d2" -> fleetrecord.d2, "d3" -> fleetrecord.d3, "d4" -> fleetrecord.d4,
        "ebt" -> fleetrecord.eBt, "ibt" -> fleetrecord.iBt, "a1" -> fleetrecord.a1, "a2" -> fleetrecord.a2,
        "a3" -> fleetrecord.a3, "a4" -> fleetrecord.a4),
      fleetrecord.evT)
  }

  /**
   * Entry point: configures Spark, creates a receiver-based Kafka stream with a
   * 10-second batch interval and writes every parsed record to both Cassandra tables.
   *
   * @param args `args(0)` = Kafka topic, `args(1)` = ZooKeeper connect string,
   *             `args(2)` = Kafka consumer group id
   * @throws IllegalArgumentException if fewer than three arguments are supplied
   */
  def main(args: Array[String]): Unit = {
    // Fail fast with a readable message before any Spark resources are created.
    require(args.length >= 3, "Usage: liveData <topic> <zookeeper-connect> <group-id>")
    val Array(topic, zookeeperConnect, groupId) = args.take(3)

    // NOTE(review): per the Spark configuration docs, driver-side settings such as
    // spark.driver.memory and the deploy mode normally have to be supplied at submit
    // time; setting them here may have no effect — confirm.
    val conf = new SparkConf()
      //.setMaster("local[3]")
      .setAppName("Fleet Live Data")
      .set("spark.cassandra.connection.host", "ip")
      .set("spark.cassandra.connection.keep_alive_ms", "60000")
      .set("spark.cassandra.auth.username", "user")
      .set("spark.cassandra.auth.password", "pass")
      .set("spark.executor.memory", "2g")
      .set("spark.driver.memory", "2g")
      .set("spark.submit.deployMode", "cluster")
      .set("spark.executor.instances", "4")
      .set("spark.executor.cores", "1")
      .set("spark.cores.max", "9")
      .set("spark.driver.cores", "9")
      // .set("spark.cassandra.input.split.size_in_mb", "67108864")
      // .set("spark.streaming.backpressure.enabled", "true")
      .set("spark.speculation", "true")
      .set("spark.locality.wait", "2s")
    println("Spark Configuration Done")

    val spark = SparkSession
      .builder
      .appName("Fleet Live Data")
      .config(conf)
      .getOrCreate()
    println("Spark Session Config Done")

    // Reuse the context owned by the session instead of looking it up a second time.
    val sc = spark.sparkContext
    sc.setLogLevel("ERROR")
    val ssc = new StreamingContext(sc, Seconds(10))

    // topic -> 1: presumably the number of consumer threads the single receiver uses
    // for this topic (see the KafkaUtils.createStream docs) — verify.
    val topics = Map(topic -> 1)
    val kafkaParams = Map[String, String](
      "zookeeper.connect" -> zookeeperConnect,
      "group.id" -> groupId,
      "auto.offset.reset" -> "largest")

    val kafkaStream = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics, StorageLevel.MEMORY_AND_DISK_SER)
    val collection = kafkaStream.map(_._2).map(parser)

    // Two output operations consume this stream. Without persistence each batch is
    // parsed once per output operation, doubling the JSON/MD5 work and letting the
    // System.currentTimeMillis() value differ between the two tables.
    collection.persist(StorageLevel.MEMORY_ONLY)

    collection.saveToCassandra("trackfleet_db", "locationinfo")
    collection.saveToCassandra(
      "trackfleet_db",
      "locationinfo_recent",
      writeConf = WriteConf(timestamp = TimestampOption.perRow("gpsdtt")))

    println("Batch Started at " + formatUtc(dateTimeFormatter, System.currentTimeMillis()))
    ssc.start()
    ssc.awaitTermination()
  }
}
数据能够被正常消费,该作业以10秒为一个批次运行,每10秒消费约3K条记录。但是 Kafka 队列中的消费滞后(lag)却在持续增加,可能是什么问题?这很奇怪。需要一些帮助。谢谢。