我正在使用Kafka进行数据传输,Spark流式传输来接收数据,并使用SHC(Spark-Hbase Connector / Hortonworks)将数据保存到HBase。
但是我在保存数据时遇到问题:保存每个批次的数据都耗费大量时间,因此我怀疑问题出在数据写入(缓冲)环节。
这是我的代码。
import kafka.serializer.StringDecoder
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.sql.{SQLContext, _}
import org.apache.spark.sql.execution.datasources.hbase._
import org.apache.spark.{SparkConf, SparkContext}
import spark.sqlContext.implicits._
import java.time.{Instant, ZoneId, ZonedDateTime}
import java.time.format.{ DateTimeFormatter, DateTimeParseException }
// Streaming context: 1-second micro-batches on the existing SparkContext.
val ssc = new StreamingContext(sc, Seconds(1))

// Kafka connection settings (direct-stream API, no receiver).
val brokers = "sdc4:9092,sdc5:9092,sdc6:9092,sdc7:9092"
val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)

// Topic list is comma-separated; here a single topic.
val topics = "sensor_rpi"
val topicsSet = topics.split(",").toSet

// Direct stream of (key, value) pairs; we only need the message value.
val messages =
  KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
    ssc, kafkaParams, topicsSet
  ).map { case (_, value) => value }

// Each Kafka message may contain several records separated by '=' runs.
val words = messages.flatMap(_.split("[=]+"))
// SHC catalog describing how Sensor rows map onto the HBase table
// `default:sensor_rpi`: the row key plus three int columns in family DATA.
// NOTE: the original called .stripMargin without any '|' margin characters,
// making it a no-op and leaving misleading leading whitespace; margins are
// now present so stripMargin actually normalizes the JSON.
def catalog = s"""{
                 |  "table":{"namespace":"default", "name":"sensor_rpi"},
                 |  "rowkey":"key",
                 |  "columns":{
                 |    "rowkey":{"cf":"rowkey", "col":"key", "type":"string"},
                 |    "X":{"cf":"DATA", "col":"X", "type":"int"},
                 |    "Y":{"cf":"DATA", "col":"Y", "type":"int"},
                 |    "Z":{"cf":"DATA", "col":"Z", "type":"int"}
                 |  }
                 |}""".stripMargin
/** One accelerometer reading destined for HBase: a composite row key
  * plus the three integer axis values (column family DATA).
  */
case class Sensor(rowkey: String, X: Int, Y: Int, Z: Int) extends Serializable
// Parse each '|'-delimited record into a Sensor.
// Expected layout: X|Y|Z|date|id  (fields 3 and 4 form the row key).
// Fixes: the original built a DateTimeFormatter per record into an unused
// local (`dateTime`) — dead work on every message — and declared the
// stream with `var` although it is never reassigned.
val sensor = words.map { line =>
  val str = line.split('|')
  // NOTE(review): assumes every record has >= 5 fields and numeric X/Y/Z;
  // malformed Kafka messages will fail the batch — confirm upstream format.
  Sensor(str(3) + "." + str(4), str(0).toInt, str(1).toInt, str(2).toInt)
}
// Write each micro-batch to HBase through the SHC data source.
// PERFORMANCE FIX: the original did sc.parallelize(rdd.collect()), which
// pulls every record to the driver and re-distributes it before writing —
// the source of the observed save latency. Writing the RDD directly keeps
// the data on the executors. Empty batches are skipped so no HBase write
// is attempted when Kafka delivered nothing in the interval.
sensor.foreachRDD { rdd =>
  if (!rdd.isEmpty) {
    rdd.toDF.write
      .options(Map(
        HBaseTableCatalog.tableCatalog -> catalog,
        HBaseTableCatalog.newTable -> "5"))
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .save()
  }
}
ssc.start()
当我删除foreachRDD函数时,输入(kafka)和输出(Hbase)之间没有任何延迟时间。我不知道该怎么办...
请给我一些提示...