I am using the HBaseContext.bulkGet API in Spark Streaming. Here is my code:
```scala
import org.apache.hadoop.hbase.{CellUtil, TableName}
import org.apache.hadoop.hbase.client.{Get, Result}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

def enrichMessage(rdd: RDD[(String, SubscriberDetails)]): RDD[(String, SubscriberDetails)] = {

  // Copies the manufacturer/model cells from the HBase Result into the message
  def processResult(key: String, message: SubscriberDetails, result: Result): (String, SubscriberDetails) = {
    var manufacturer: String = "Y"
    var model: String = "Y"
    val resultRow: Array[Byte] = result.getRow
    if (resultRow != null) {
      // Get the values from the HBase result
      for (cell <- result.rawCells()) {
        val qualifier = Bytes.toString(CellUtil.cloneQualifier(cell))
        if (qualifier.equalsIgnoreCase("manufacturer")) {
          manufacturer = Bytes.toString(CellUtil.cloneValue(cell))
        }
        if (qualifier.equalsIgnoreCase("model")) {
          model = Bytes.toString(CellUtil.cloneValue(cell))
        }
      }
      // Enrich the message with the fetched values
      message.manufacturer = manufacturer
      message.model = model
    }
    (key, message)
  }

  // Shared mutable state: the record->Get closure writes these and the
  // result closure reads them back, since convertResult only sees the Result
  var mssg: SubscriberDetails = null
  var key: String = null

  // hbaseContext is an HBaseContext created on the driver
  val enrichedRDD: RDD[(String, SubscriberDetails)] =
    hbaseContext.bulkGet[(String, SubscriberDetails), (String, SubscriberDetails)](
      TableName.valueOf("prod:customer"),
      1000,                                // batch size
      rdd,                                 // input RDD
      record => {
        key = record._1
        mssg = record._2
        new Get(Bytes.toBytes(mssg.imsi)) // rowkey is the subscriber IMSI
      },
      (result: Result) => processResult(key, mssg, result)
    )
  enrichedRDD
}
```
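(For context: as I understand it, bulkGet's signature looks roughly like the following. convertResult only receives the Result, which is why key and mssg are passed through shared vars above.)

```scala
// Approximate shape of the hbase-spark API, as I understand it:
def bulkGet[T, U](tableName: TableName,
                  batchSize: Integer,
                  rdd: RDD[T],
                  makeGet: T => Get,
                  convertResult: Result => U): RDD[U]
```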
The approach above has a serious performance problem: fetching ~70K records takes about 30 seconds, and as part of the streaming job those 70K records arrive within one second. I have two questions.

1. How can I speed this enrichment up? Would a per-partition multi-get, as in the sketch below, behave any differently?
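For reference, this is the kind of alternative I have been weighing: a plain-HBase-client multi-get inside mapPartitions. It is only a sketch under the assumptions noted in the comments, and I do not know yet whether it would actually beat bulkGet, but it does keep each (key, message) matched to its own Result instead of going through shared vars:

```scala
// Sketch of a per-partition multi-get with the plain HBase client.
// Assumptions: hbase-site.xml is on the executor classpath, and the
// processResult helper from above has been pulled out so it is in scope.
import scala.collection.JavaConverters._
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

def enrichWithMultiGet(rdd: RDD[(String, SubscriberDetails)]): RDD[(String, SubscriberDetails)] =
  rdd.mapPartitions { iter =>
    // One connection per partition, amortized over all its records
    val conn  = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val table = conn.getTable(TableName.valueOf("prod:customer"))
    try {
      // Materialize the partition so the connection can be closed safely
      val out = iter.grouped(1000).flatMap { batch =>
        val gets    = batch.map { case (_, m) => new Get(Bytes.toBytes(m.imsi)) }
        val results = table.get(gets.asJava)  // one multi-get round trip per batch
        // zip keeps every (key, message) paired with its own Result
        batch.zip(results).map { case ((k, m), r) => processResult(k, m, r) }
      }.toList
      out.iterator
    } finally {
      table.close()
      conn.close()
    }
  }
```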
2. Is there a way to restrict which columns bulkGet fetches? With TableInputFormat I can limit a scan to specific columns like this:

```java
hbaseConfig.set(TableInputFormat.INPUT_TABLE, schemdto.tableName);
hbaseConfig.set(TableInputFormat.SCAN_COLUMNS, "fname:column1 fname:column2");
```
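Would the bulkGet equivalent be to narrow each Get to just the two qualifiers I need? A minimal sketch of what I mean — the column family name "fname" is only an assumption borrowed from the snippet above, not confirmed from my schema:

```scala
// Hypothetical Get builder restricted to the two columns the enrichment needs.
// "fname" is an assumed column family, taken from the SCAN_COLUMNS example.
def makeNarrowGet(mssg: SubscriberDetails): Get = {
  val cf  = Bytes.toBytes("fname")
  val get = new Get(Bytes.toBytes(mssg.imsi))
  get.addColumn(cf, Bytes.toBytes("manufacturer"))  // fetch only manufacturer
  get.addColumn(cf, Bytes.toBytes("model"))         // ...and model
  get
}
```

If that works with bulkGet, it should also cut the amount of data shipped back per row.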
Thanks.