I want to retrieve data from an HBase table in a Spark Streaming job. I have an HBase table that is frequently populated with records. Records are inserted with the column process_flag='N'. I want to run a Spark Streaming job that runs continuously and retrieves the records with process_flag='N'. Once processing is complete, process_flag is updated to 'Y' so that only new records are picked up on the next run.
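For reference, the flag update after processing is not part of the streaming code below. A minimal sketch of how that update could be done with the plain HBase client API (markProcessed is just a hypothetical helper; the table and column-family names are placeholders) looks like this:

// Hypothetical sketch: mark a processed row by setting process_flag to 'Y'.
// Table and column-family names are placeholders; error handling is kept minimal.
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

def markProcessed(rowKey: String): Unit = {
  val conf = HBaseConfiguration.create()
  val connection = ConnectionFactory.createConnection(conf)
  try {
    val table = connection.getTable(TableName.valueOf("<HBase_Table_Name>"))
    val put = new Put(Bytes.toBytes(rowKey))
    put.addColumn(Bytes.toBytes("<HBase_Column_Family>"), Bytes.toBytes("process_flag"), Bytes.toBytes("Y"))
    table.put(put)
    table.close()
  } finally {
    connection.close()
  }
}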
I tried to create a custom receiver with Spark Streaming, but I am getting an exception.
object GenerateIDDriver {
  val spark = SparkSession.builder
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .enableHiveSupport()
    .getOrCreate()
  // Create the SparkContext
  val sc = spark.sparkContext

  def main(args: Array[String]) {
    // Batch interval used to create the streaming context
    val batchInterval: org.apache.spark.streaming.Duration = Seconds(1)
    val ssc = new StreamingContext(sc, batchInterval)
    val customReceiverStream = ssc.receiverStream(new CustomReceiver(spark, sc, "<HBase_Table_Name>", "<HBase_Column_Family>"))
    customReceiverStream.foreachRDD({ rdd =>
      // Get the singleton instance of SparkSession
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      val dataDF = rdd.toDF()
      dataDF.show()
    })
    // Start the streaming job and wait for it to terminate
    ssc.start()
    ssc.awaitTermination()
  }
}
class CustomReceiver(spark: SparkSession, sc: SparkContext, hbaseTableName: String, familyName: String)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("HBase Data Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  private def receive() {
    try {
      // HBase config file
      val hbaseConfigFile = "location of hbase-site.xml"
      val hb = HBaseConfiguration.create()
      hb.addResource(hbaseConfigFile)
      val admin = new HBaseAdmin(hb)
      val flagValue = "N"
      while (!isStopped) {
        val resultDF = HBaseUtility.retrieveRecordsFromHBase(hb, admin, sc, spark, hbaseTableName, familyName, readProp, flagValue)
        resultDF.show(false)
        store(resultDF.toString())
      }
      // Restart in an attempt to connect again when the server is active again
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        // Restart if we could not connect to the server
        restart("Error connecting to HBase Server :", e)
      case t: Throwable =>
        // Restart if there is any other error
        restart("Error receiving data", t)
    }
  }
}
object HBaseUtility extends Serializable {

  def retrieveRecordsFromHBase(hb: Configuration, admin: HBaseAdmin, sc: SparkContext, spark: SparkSession, hbaseTableName: String, familyName: String, readProp: Properties, flagValue: String): DataFrame = {
    val processFlagCol = "process_flag"
    import spark.implicits._
    var stageDF: DataFrame = spark.emptyDataFrame
    if (admin.tableExists(hbaseTableName)) {
      hb.set(TableInputFormat.INPUT_TABLE, hbaseTableName)
      hb.set(TableInputFormat.SCAN_COLUMNS, familyName) // scan the data column family
      val scan: Scan = new Scan()
      val filter: SingleColumnValueFilter = new SingleColumnValueFilter(Bytes.toBytes(familyName), Bytes.toBytes(processFlagCol), CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(flagValue)))
      scan.setFilter(filter)
      hb.set(TableInputFormat.SCAN, convertScanToString(scan))
      // Load an RDD of (ImmutableBytesWritable, Result) tuples from the table
      val hBaseRDD = sc.newAPIHadoopRDD(hb, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])
      // Transform the (ImmutableBytesWritable, Result) tuples into an RDD of Results
      val resultRDD = hBaseRDD.map(x => x._2)
      // Transform the RDD of Results into an RDD of StageDataRow objects
      val stageRDD = resultRDD.map(res => StageDataRow.parseStageHbaseRow(res, familyName, readProp))
      stageDF = stageRDD.toDF() // convert the RDD of StageDataRow objects to a DataFrame
    }
    stageDF
  }

  /**
   * Encodes the Scan object as a Base64 string for TableInputFormat.SCAN.
   * @param scan the Scan to encode
   * @return the Base64-encoded protobuf form of the Scan
   */
  def convertScanToString(scan: Scan): String = {
    val proto: org.apache.hadoop.hbase.protobuf.generated.ClientProtos.Scan = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray())
  }
  /**
   * Represents one parsed row from the HBase stage table.
   */
  case class StageDataRow(
    rowkey: String,
    column1: String,
    created_by: String,
    created_date: String,
    processFlag: String,
    modified_by: String,
    modified_date: String,
    surrogate_key: String)
  object StageDataRow extends Serializable {
    /**
     * @param result       the HBase Result for a single row
     * @param columnFamily the column family to read the values from
     * @param readProp     read-side properties
     * @return the parsed StageDataRow
     */
    def parseStageHbaseRow(result: Result, columnFamily: String, readProp: Properties): StageDataRow = {
      val rowkey = Bytes.toString(result.getRow())
      val cfDataBytes = Bytes.toBytes(columnFamily)
      //val p0 = rowkey.split(" ")(0) // remove time from rowKey, stats row key is for day
      val p0 = rowkey
      // createdBy, createdDate, processFlag, etc. are column-name constants defined elsewhere
      val p1 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes("column1")))
      val p2 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(createdBy)))
      val p3 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(createdDate)))
      val p4 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(processFlag)))
      val p5 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(modifiedBy)))
      val p6 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(modifiedDate)))
      val p7 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(surrogate_key)))
      StageDataRow(p0, p1, p2, p3, p4, p5, p6, p7)
    }
  }
}
I am getting a NullPointerException at the following line in the HBaseUtility Scala object:
var stageDF: DataFrame = spark.emptyDataFrame
2019-08-07 12:12:10 ERROR ReceiverTracker:70 - Deregistered receiver for stream 0: Restarting receiver with delay 2000ms: Error receiving data - java.lang.NullPointerException
    at org.apache.spark.sql.SparkSession.emptyDataFrame$lzycompute(SparkSession.scala:265)
    at org.apache.spark.sql.SparkSession.emptyDataFrame(SparkSession.scala:264)