I want to retrieve data from an HBase table in a Spark Streaming job. I have an HBase table that is frequently populated with records. Records are inserted with the column process_flag='N'. I want to run a Spark Streaming job that runs continuously and retrieves the records with process_flag='N'. Once processing is complete, process_flag is updated to 'Y' so that only new records are picked up on the next run.
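For reference, the flag update after processing is not part of the streaming code below. A minimal sketch of how that update could be done with the plain HBase client API (markProcessed is just a hypothetical helper; the table and column-family names are placeholders) looks like this:

// Hypothetical sketch: mark a processed row by setting process_flag to 'Y'.
// Table and column-family names are placeholders; error handling is kept minimal.
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

def markProcessed(rowKey: String): Unit = {
  val conf = HBaseConfiguration.create()
  val connection = ConnectionFactory.createConnection(conf)
  try {
    val table = connection.getTable(TableName.valueOf("<HBase_Table_Name>"))
    val put = new Put(Bytes.toBytes(rowKey))
    put.addColumn(Bytes.toBytes("<HBase_Column_Family>"), Bytes.toBytes("process_flag"), Bytes.toBytes("Y"))
    table.put(put)
    table.close()
  } finally {
    connection.close()
  }
}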
I tried to create a custom receiver with Spark Streaming, but I am getting an exception.
object GenerateIDDriver {
  val spark = SparkSession.builder
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .enableHiveSupport()
    .getOrCreate()
  // Create the SparkContext
  val sc = spark.sparkContext

  def main(args: Array[String]) {
    // Batch interval used to create the streaming context
    val batchInterval: org.apache.spark.streaming.Duration = Seconds(1)
    val ssc = new StreamingContext(sc, batchInterval)
    val customReceiverStream = ssc.receiverStream(new CustomReceiver(spark, sc, "<HBase_Table_Name>", "<HBase_Column_Family>"))
    customReceiverStream.foreachRDD({ rdd =>
      // Get the singleton instance of SparkSession
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      val dataDF = rdd.toDF()
      dataDF.show()
    })
    // Start the streaming job and wait for it to terminate
    ssc.start()
    ssc.awaitTermination()
  }
}
class CustomReceiver(spark: SparkSession, sc: SparkContext, hbaseTableName: String, familyName: String)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("HBase Data Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // There is nothing much to do as the thread calling receive()
    // is designed to stop by itself if isStopped() returns false
  }

  private def receive() {
    try {
      // HBase config file
      val hbaseConfigFile = "location of hbase-site.xml"
      val hb = HBaseConfiguration.create()
      hb.addResource(hbaseConfigFile)
      val admin = new HBaseAdmin(hb)
      val flagValue = "N"
      while (!isStopped) {
        val resultDF = HBaseUtility.retrieveRecordsFromHBase(hb, admin, sc, spark, hbaseTableName, familyName, readProp, flagValue)
        resultDF.show(false)
        store(resultDF.toString())
      }
      // Restart in an attempt to connect again when the server is active again
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        // Restart if we could not connect to the server
        restart("Error connecting to HBase Server :", e)
      case t: Throwable =>
        // Restart if there is any other error
        restart("Error receiving data", t)
    }
  }
}
object HBaseUtility extends Serializable {

  def retrieveRecordsFromHBase(hb: Configuration, admin: HBaseAdmin, sc: SparkContext, spark: SparkSession, hbaseTableName: String, familyName: String, readProp: Properties, flagValue: String): DataFrame = {
    val processFlagCol = "process_flag"
    import spark.implicits._
    var stageDF: DataFrame = spark.emptyDataFrame
    if (admin.tableExists(hbaseTableName)) {
      hb.set(TableInputFormat.INPUT_TABLE, hbaseTableName)
      hb.set(TableInputFormat.SCAN_COLUMNS, familyName) // scan the data column family
      val scan: Scan = new Scan()
      val filter: SingleColumnValueFilter = new SingleColumnValueFilter(Bytes.toBytes(familyName), Bytes.toBytes(processFlagCol), CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(flagValue)))
      scan.setFilter(filter)
      hb.set(TableInputFormat.SCAN, convertScanToString(scan))
      // Load an RDD of (ImmutableBytesWritable, Result) tuples from the table
      val hBaseRDD = sc.newAPIHadoopRDD(hb, classOf[TableInputFormat],
        classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
        classOf[org.apache.hadoop.hbase.client.Result])
      // Transform the (ImmutableBytesWritable, Result) tuples into an RDD of Results
      val resultRDD = hBaseRDD.map(x => x._2)
      // Transform the RDD of Results into an RDD of StageDataRow objects
      val stageRDD = resultRDD.map(res => StageDataRow.parseStageHbaseRow(res, familyName, readProp))
      stageDF = stageRDD.toDF() // convert the RDD of StageDataRow objects to a DataFrame
    }
    stageDF
  }

  /**
   * Encodes the Scan object as a Base64 string for TableInputFormat.SCAN.
   * @param scan the Scan to encode
   * @return the Base64-encoded protobuf form of the Scan
   */
  def convertScanToString(scan: Scan): String = {
    val proto: org.apache.hadoop.hbase.protobuf.generated.ClientProtos.Scan = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray())
  }
  /**
   * Represents one parsed row from the HBase stage table.
   */
  case class StageDataRow(
    rowkey: String,
    column1: String,
    created_by: String,
    created_date: String,
    processFlag: String,
    modified_by: String,
    modified_date: String,
    surrogate_key: String)
  object StageDataRow extends Serializable {
    /**
     * @param result       the HBase Result for a single row
     * @param columnFamily the column family to read the values from
     * @param readProp     read-side properties
     * @return the parsed StageDataRow
     */
    def parseStageHbaseRow(result: Result, columnFamily: String, readProp: Properties): StageDataRow = {
      val rowkey = Bytes.toString(result.getRow())
      val cfDataBytes = Bytes.toBytes(columnFamily)
      //val p0 = rowkey.split(" ")(0) // remove time from rowKey, stats row key is for day
      val p0 = rowkey
      // createdBy, createdDate, processFlag, etc. are column-name constants defined elsewhere
      val p1 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes("column1")))
      val p2 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(createdBy)))
      val p3 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(createdDate)))
      val p4 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(processFlag)))
      val p5 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(modifiedBy)))
      val p6 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(modifiedDate)))
      val p7 = Bytes.toString(result.getValue(cfDataBytes, Bytes.toBytes(surrogate_key)))
      StageDataRow(p0, p1, p2, p3, p4, p5, p6, p7)
    }
  }
}
I am getting a NullPointerException at the following line in the HBaseUtility Scala object:
var stageDF: DataFrame = spark.emptyDataFrame
2019-08-07 12:12:10 ERROR ReceiverTracker:70 - Deregistered receiver for stream 0: Restarting receiver with delay 2000ms: Error receiving data - java.lang.NullPointerException
    at org.apache.spark.sql.SparkSession.emptyDataFrame$lzycompute(SparkSession.scala:265)
    at org.apache.spark.sql.SparkSession.emptyDataFrame(SparkSession.scala:264)