Spark Cassandra from Kafka using Scala performing poorly

Asked: 2018-05-16 13:56:39

Tags: scala apache-spark spark-streaming spark-cassandra-connector

I have the following code:

import java.text.SimpleDateFormat
import java.util.Calendar
import scala.collection.mutable.ListBuffer
import org.apache.commons.codec.digest.DigestUtils
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka.KafkaUtils
import com.datastax.spark.connector.SomeColumns
import com.datastax.spark.connector.toRDDFunctions
import com.datastax.spark.connector.streaming._
import com.redis.RedisClientPool
import kafka.serializer.StringDecoder
import net.liftweb.json.DefaultFormats
import net.liftweb.json.parse
import org.apache.spark.util.SizeEstimator
case class event(imei: String, date: String, gpsdt: String, dt: String, id: String)
case class historyevent(imei: String, date: String, gpsdt: String)
object historyData extends Serializable {

  def main(args: Array[String]) {

val clients = new RedisClientPool("192.168.0.40", 6379)
val conf = new SparkConf()
  .setAppName("Fleet History Data")
  .set("spark.cassandra.connection.host", "192.168.0.40")
  .set("spark.cassandra.connection.keep_alive_ms", "20000")
  .set("spark.executor.memory", "800M")
  .set("spark.driver.memory", "800M")
  .set("spark.submit.deployMode", "cluster")
  .set("spark.executor.instances", "4")
  .set("spark.executor.cores", "1")
  .set("spark.cores.max", "4")
  .set("spark.driver.cores", "1")
  .set("spark.streaming.backpressure.enabled", "true")

val sc = SparkContext.getOrCreate(conf)
val ssc = new StreamingContext(sc, Seconds(5))

val topics = Map("topicname" -> 1)

val kafkaStream = KafkaUtils.createStream(ssc, "192.168.0.40:2181", "groupname", topics)

kafkaStream.foreachRDD { rdd =>
  val updatedRDD =
    rdd.map(a =>
      {
        implicit val formats = DefaultFormats
        val jValue = parse(a._2)
        //  println("value " + jValue.toString())
        val fleetrecord = jValue.extract[historyevent]
        val hash = fleetrecord.imei + fleetrecord.date + fleetrecord.gpsdt
        val md5Hash = DigestUtils.md5Hex(hash).toUpperCase()
        val now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime())

        event(fleetrecord.imei, fleetrecord.date, fleetrecord.gpsdt, now, md5Hash)

      })
      .collect()

  updatedRDD.foreach(f =>
    {
      clients.withClient {
        client =>
          {
            val value = f.imei + " , " + f.gpsdt
            val zscore = Calendar.getInstance().getTimeInMillis
            val key = new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime())
            val dt = new SimpleDateFormat("HH:mm:ss").format(Calendar.getInstance().getTime())
            val q1 = "00:00:00"
            val q2 = "06:00:00"
            val q3 = "12:00:00"
            val q4 = "18:00:00"
            // Bucket the record into one of four 6-hour quarters of the day.
            val quarter = if (dt > q1 && dt < q2) {
              " -> 1"
            } else if (dt > q2 && dt < q3) {
              " -> 2"
            } else if (dt > q3 && dt < q4) {
              " -> 3"
            } else {
              " -> 4"
            }
            client.zadd(key + quarter, zscore, value)
          }
      }
    })

  // Re-parallelize the collected array (~470 records per partition) and save it to Cassandra.
  val part = (updatedRDD.length + 470 - 1) / 470
  val collection = if (part > 0) {
    sc.parallelize(updatedRDD, part)
  } else {
    sc.parallelize(updatedRDD)
  }
  collection.saveToCassandra("db", "table")

}
ssc.start()
ssc.awaitTermination()
  }
}

My hardware specifications:

  • 4-node Cassandra cluster

I am consuming JSON strings from Kafka and inserting them into both Redis and Cassandra, but the performance is very poor: inserts take roughly 1-2 seconds per thousand records. For example, for 20K records the Spark UI shows it taking about 18-22 seconds. What can I do to tune the performance?
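For reference, a large share of the per-batch time here is likely the collect() call, which pulls every record to the driver, writes to Redis serially from the driver, and then re-parallelizes the array just to save it to Cassandra. Below is a minimal sketch of the same foreachRDD body kept fully distributed; it assumes the imports, case classes, keyspace/table names and Redis address from the code above, opens one RedisClientPool per partition purely for illustration, and omits the quarter-of-day bucketing of the Redis key for brevity.

// Sketch only: a distributed variant of the foreachRDD body above (assumptions noted in the text).
kafkaStream.foreachRDD { rdd =>
  val events = rdd.map { case (_, json) =>
    implicit val formats = DefaultFormats
    val rec = parse(json).extract[historyevent]
    val md5Hash = DigestUtils.md5Hex(rec.imei + rec.date + rec.gpsdt).toUpperCase()
    val now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime())
    event(rec.imei, rec.date, rec.gpsdt, now, md5Hash)
  }.cache() // reused by both the Redis write and the Cassandra save

  // Write to Redis on the executors, one client pool per partition.
  events.foreachPartition { records =>
    val pool = new RedisClientPool("192.168.0.40", 6379) // assumes Redis is reachable from the executors
    records.foreach { f =>
      val zscore = Calendar.getInstance().getTimeInMillis
      val key = new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime())
      pool.withClient(client => client.zadd(key, zscore, f.imei + " , " + f.gpsdt))
    }
  }

  // Save the RDD directly; no collect()/parallelize round trip through the driver.
  events.saveToCassandra("db", "table")
  events.unpersist()
}

The point of this sketch is only the shape of the job (no driver-side funnel per micro-batch); executor counts, the 5-second batch interval, and Cassandra/Redis-side settings would still need to be measured and tuned separately.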

0 Answers:

No answers yet.