How to connect to HBase from Spark

Date: 2015-08-12 11:46:02

Tags: scala apache-spark hbase

I want to connect to HBase from Spark, but I get an exception. When I do the same thing from plain Scala, I do not get this error.

I am using Scala 2.10.4 and Spark 1.3.0 on CDH 5.4.0.

The code is:

import org.apache.spark.SparkContext
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.spark.SparkConf
import com.cloudera.spark.hbase.HBaseContext

object HBaseBulkPutExample {
    def main(args: Array[String]) {
        val tableName = "image_table"
        val columnFamily = "Image Data"

        val sparkConf = new SparkConf().setAppName("HBaseBulkPutExample " + tableName + " " + columnFamily)
        val sc = new SparkContext(sparkConf)

        // Each element is (rowKey, Array((columnFamily, qualifier, value)))
        val rdd = sc.parallelize(Array(
            (Bytes.toBytes("1"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("1")))),
            (Bytes.toBytes("2"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("2")))),
            (Bytes.toBytes("3"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("3")))),
            (Bytes.toBytes("4"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("4")))),
            (Bytes.toBytes("5"), Array((Bytes.toBytes(columnFamily), Bytes.toBytes("1"), Bytes.toBytes("5"))))
            )
        )

        // Load the cluster's hbase-site.xml so the client can locate ZooKeeper
        val conf = HBaseConfiguration.create()
        conf.addResource(new Path("/eds/servers//hbase-1.0.1.1/conf/hbase-site.xml"))

        val hbaseContext = new HBaseContext(sc, conf)
        // bulkPut takes the RDD, the target table, and a function turning each record into a Put
        hbaseContext.bulkPut[(Array[Byte], Array[(Array[Byte], Array[Byte], Array[Byte])])](rdd,
           tableName,
           (putRecord) => {
               val put = new Put(putRecord._1)
               putRecord._2.foreach((putValue) => put.add(putValue._1, putValue._2, putValue._3))
               put
            },
            true)
    }
}

When I build a jar and run it, I get the following error:

org.apache.hadoop.hbase.DoNotRetryIOException: java.lang.NoSuchMethodError: org.apache.hadoop.net.NetUtils.getInputStream(Ljava/net/Socket;)Lorg/apache/hadoop/net/SocketInputWrapper;
    at org.apache.hadoop.hbase.ipc.RpcClient$Connection.setupIOstreams(RpcClient.java:928)
    at org.apache.hadoop.hbase.ipc.RpcClient.getConnection(RpcClient.java:1543)
    at org.apache.hadoop.hbase.ipc.RpcClient.call(RpcClient.java:1442)
    at org.apache.hadoop.hbase.ipc.RpcClient.callBlockingMethod(RpcClient.java:1661)
    at org.apache.hadoop.hbase.ipc.RpcClient$BlockingRpcChannelImplementation.callBlockingMethod(RpcClient.java:1719)
    at org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$BlockingStub.get(ClientProtos.java:30304)
    at org.apache.hadoop.hbase.protobuf.ProtobufUtil.getRowOrBefore(ProtobufUtil.java:1562)
    at org.apache.hadoop.hbase.client.HTable$2.call(HTable.java:711)
    at org.apache.hadoop.hbase.client.HTable$2.call(HTable.java:709)
    at org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:114)
    at org.apache.hadoop.hbase.client.HTable.getRowOrBefore(HTable.java:715)
    at org.apache.hadoop.hbase.client.MetaScanner.metaScan(MetaScanner.java:144)
    at org.apache.hadoop.hbase.client.HConnectionManager$HConnectionImplementation.prefetchRegionCache(HConnectionManager.java:1140)

1 Answer:

Answer 0 (score: 0)

I am not sure of the exact cause, but a NoSuchMethodError like this usually indicates a binary version mismatch: the HBase client was compiled against a different Hadoop version than the one on your runtime classpath. Make sure your Spark, Hadoop, and HBase jars all come from the same CDH release.
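If you build with sbt, one way to keep the versions aligned is to take Spark, Hadoop, and HBase from the same CDH release. The following is a minimal sketch assuming the CDH 5.4.0 artifact coordinates and the public Cloudera repository; verify the exact versions against your cluster:

// build.sbt -- a sketch; the CDH 5.4.0 coordinates below are assumptions, check your cluster
resolvers += "cloudera" at "https://repository.cloudera.com/artifactory/cloudera-repos/"

libraryDependencies ++= Seq(
  "org.apache.spark"  %% "spark-core"    % "1.3.0-cdh5.4.0" % "provided",
  "org.apache.hadoop" %  "hadoop-client" % "2.6.0-cdh5.4.0" % "provided",
  "org.apache.hbase"  %  "hbase-client"  % "1.0.0-cdh5.4.0",
  "org.apache.hbase"  %  "hbase-server"  % "1.0.0-cdh5.4.0"
)

Marking Spark and Hadoop as provided keeps your jar from shipping its own copies, so only the cluster's versions are on the classpath at runtime.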

Here is an example of how to connect to HBase using Spark:

import org.apache.spark._
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor}
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat

...

val conf = HBaseConfiguration.create()
conf.set(TableInputFormat.INPUT_TABLE, "image_table")

// Create the table if it does not exist yet
val admin = new HBaseAdmin(conf)
if (!admin.isTableAvailable("image_table")) {
  val tableDesc = new HTableDescriptor("image_table")
  // HBase requires at least one column family; reusing the family name from the question
  tableDesc.addFamily(new HColumnDescriptor("Image Data"))
  admin.createTable(tableDesc)
}

// Read the table as an RDD of (row key, Result) pairs
val rdd = sc.newAPIHadoopRDD(
    conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

The Result class provides various methods for reading the values back out.
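For example, here is a minimal sketch of reading one cell back from each row; the Image Data family and the 1 qualifier are taken from the question and are assumptions about your schema:

import org.apache.hadoop.hbase.util.Bytes

// Family and qualifier from the question -- adjust to your schema
val family = Bytes.toBytes("Image Data")
val qualifier = Bytes.toBytes("1")

val values = rdd.map { case (_, result) =>
  val rowKey = Bytes.toString(result.getRow)
  // getValue returns null when the cell is absent, hence the Option
  val cell = Option(result.getValue(family, qualifier)).map(Bytes.toString)
  (rowKey, cell)
}
values.take(5).foreach(println)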