Spark Structured Streaming with HBase Sink

Time: 2020-07-02 06:50:09

Tags: scala spark-streaming spark-structured-streaming

My use case is to read Kafka messages with Structured Streaming and push them into HBase from foreachBatch using a few bulk Puts, to get better performance than individual Puts. I am able to push the messages using foreach (thanks to {{ }}), but I cannot do the same with foreachBatch.

Can someone help? The code is attached below.

KafkaStructured.scala:

package com.test

import java.math.BigInteger
import java.util

import com.fasterxml.jackson.annotation.JsonIgnoreProperties
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql._
import org.apache.spark.sql.functions._


object KafkaStructured {

  @JsonIgnoreProperties(ignoreUnknown = true)
  case class Header(field1: String, field2: String, field3: String)

  @JsonIgnoreProperties(ignoreUnknown = true)
  case class Body(fieldx: String)

  @JsonIgnoreProperties(ignoreUnknown = true)
  case class Event(header: Header, body: Body)

  @JsonIgnoreProperties(ignoreUnknown = true)
  case class KafkaResp(event: Event)

  @JsonIgnoreProperties(ignoreUnknown = true)
  case class HBaseDF(field1: String, field2: String, field3: String)


  def main(args: Array[String]): Unit = {

    val jsonSchema = Encoders.product[KafkaResp].schema

    val spark = SparkSession
      .builder()
      .appName("Kafka Spark")
      .getOrCreate()

    val df = spark
      .readStream
      .format("kafka")
      .option...
      .load()

    import spark.sqlContext.implicits._

    val flattenedDf: DataFrame =
      df
        .select($"value".cast("string").as("json"))
        .select(from_json($"json", jsonSchema).as("data"))
        .select("data.event.header.field1", "data.event.header.field2", "data.event.header.field3")

    val hbaseDf = flattenedDf
      .as[HBaseDF]
      .filter(hbasedf => hbasedf != null && hbasedf.field1 != null)

    flattenedDf
      .writeStream
      .option("truncate", "false")
      .option("checkpointLocation", "some hdfs location")
      .format("console")
      .outputMode("append")
      .start()

    def bytes(data: String) = {
      val bytes = data match {
        case data if data != null && !data.isEmpty => Bytes.toBytes(data)
        case _ => Bytes.toBytes("")
      }
      bytes
    }

   
    hbaseDf
      .writeStream
      .foreachBatch(function = (batchDf, batchId) => {
        val putList = new util.ArrayList[Put]()
        batchDf
          .foreach(row => {
            val p: Put = new Put(bytes(row.field1))
            val cfName= bytes("fam1")
            p.addColumn(cfName, bytes("field1"), bytes(row.field1))
            p.addColumn(cfName, bytes("field2"), bytes(row.field2))
            p.addColumn(cfName, bytes("field3"), bytes(row.field3))
            putList.add(p)
          })
        new HBaseBulkForeachWriter[HBaseDF] {
          override val tableName: String = "<my table name>"
        
          override def bulkPut: util.ArrayList[Put] = {
            putList
          }
        }
      }
      )
      .start()

    spark.streams.awaitAnyTermination()
  }

}
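
HBaseBulkForeachWriter.scala:

The writer's own source is not shown in the question, so the block below is only a rough sketch of what it might look like, inferred from how it is instantiated above (an abstract tableName plus a bulkPut returning the accumulated Puts) and assuming it extends Spark's ForeachWriter; every detail here is an assumption, not the actual implementation.

package com.test

import java.util

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put, Table}
import org.apache.spark.sql.ForeachWriter

// Hypothetical shape of the bulk writer: subclasses supply the table name and
// the list of Puts, and the Puts are flushed in a single table.put(...) call
// when the partition is closed.
abstract class HBaseBulkForeachWriter[T] extends ForeachWriter[T] {

  val tableName: String
  def bulkPut: util.ArrayList[Put]

  private var connection: Connection = _
  private var table: Table = _

  override def open(partitionId: Long, epochId: Long): Boolean = {
    connection = ConnectionFactory.createConnection(HBaseConfiguration.create())
    table = connection.getTable(TableName.valueOf(tableName))
    true
  }

  override def process(value: T): Unit = {
    // per-row handling is left out of this sketch
  }

  override def close(errorOrNull: Throwable): Unit = {
    table.put(bulkPut) // one bulk write instead of one Put per row
    table.close()
    connection.close()
  }
}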

1 Answer:

Answer 0 (score: 1):

foreachBatch allows you to use foreachPartition inside the function. The code passed to foreachPartition runs once per partition, so you can open the HBase connection and issue a single bulk put per partition instead of per row.

So you can create a function that builds a single Put:

def putValue(key: String, columnName: String, data: Array[Byte]): Put = {
  val put = new Put(Bytes.toBytes(key))
  put.addColumn(Bytes.toBytes("colFamily"), Bytes.toBytes(columnName), data)
  put
}

Then a function that bulk-inserts the Puts:

def writePutList(putList: List[Put]): Unit = {
  // zookeperUrl, tableName and logger are assumed to come from the enclosing scope
  val config: Configuration = HBaseConfiguration.create()
  config.set("hbase.zookeeper.quorum", zookeperUrl)

  val connection: Connection = ConnectionFactory.createConnection(config)
  val table = connection.getTable(TableName.valueOf(tableName))
  table.put(putList.asJava)
  logger.info("INSERT record[s] " + putList.size + " to table " + tableName + " OK.")
  table.close()
  connection.close()
}

And use them inside foreachPartition and map:

def writeFunction: (DataFrame, Long) => Unit = {
  (batchData, id) => {
    batchData.foreachPartition(
      (partition: Iterator[Row]) => {
        // build all Puts for the partition, then write them with a single bulk call
        val putList = partition.map(
          data =>
            putValue(data.getAs[String]("keyField"), "colName", Bytes.toBytes(data.getAs[String]("valueField")))
        ).toList
        writePutList(putList)
      }
    )
  }
}

Finally, use the created function in your streaming query:

df.writeStream
  .queryName("yourQueryName")
  .option("checkpointLocation", checkpointLocation)
  .outputMode(OutputMode.Update())
  .foreachBatch(writeFunction)
  .start()
  .awaitTermination()
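
For reference, the snippets in this answer assume roughly the following imports around them; zookeperUrl, tableName, logger and checkpointLocation are placeholders expected to come from your own code, and the exact list is an assumption based on the classes used above:

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.streaming.OutputMode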