How to connect to HBase from the Spark SQL context?

Date: 2017-02-22 09:04:28

Tags: apache-spark hbase apache-spark-sql

I am working on a scenario where I want to access HBase from Spark SQL and read it as a DataFrame. Below is the code.

import java.io.IOException;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import com.google.protobuf.ServiceException;

public class hbasesql {

    public static void main(String[] args) {

        SparkConf conf = new SparkConf().setAppName("HbaseStreamTest").setMaster("yarn-client");
        conf.set("spark.executor.extraClassPath", "/etc/hbase/conf/hbase-site.xml");
        conf.set("hbase.zookeeper.quorum", "servername:2181");
        conf.set("hbase.zookeeper.property.clientPort", "2181");

        // HBase client configuration, loaded from the cluster's site files.
        Configuration hconf = HBaseConfiguration.create();
        hconf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));
        hconf.addResource(new Path("/etc/hbase/conf/core-site.xml"));

        final JavaSparkContext context = new JavaSparkContext(conf);
        SQLContext sqlContext = new org.apache.spark.sql.SQLContext(context);

        try {
            HBaseAdmin.checkHBaseAvailable(hconf);
            System.out.println("HBase is running"); // this works
        } catch (ServiceException | MasterNotRunningException | ZooKeeperConnectionException e) {
            System.out.println("HBase is not running");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

        // Map SQL columns to the HBase row key and column family:qualifier pairs.
        String sqlMapping = "KEY_FIELD STRING :key"
                + ", sql_firstname STRING c1:FIRSTNAME"
                + ", sql_lastname STRING c1:LASTNAME";

        HashMap<String, String> colMap = new HashMap<String, String>();
        colMap.put("hbase.columns.mapping", sqlMapping);
        colMap.put("hbase.table", "testtable");

        DataFrame df = sqlContext.read().format("org.apache.hadoop.hbase.spark").options(colMap).load();
        // DataFrame df = sqlContext.load("org.apache.hadoop.hbase.spark", colMap); // older equivalent

        // Registering a temp table is useful when issuing SQL text queries
        // directly against the sqlContext object.
        df.registerTempTable("temp_emp");

        DataFrame result = sqlContext.sql("SELECT count(*) from temp_emp");
        System.out.println("df " + df);
        System.out.println("result " + result);

        df.show();
    }
}
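
A note on the configuration above: spark.executor.extraClassPath takes classpath entries (directories or jars), so pointing it at the hbase-site.xml file itself likely has no effect; the conventional entry is the containing directory, /etc/hbase/conf. A quick sanity check along these lines (standard HBase keys; paths taken from the snippet above) shows whether the site file is actually being read:

    Configuration hconf = HBaseConfiguration.create();
    hconf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));
    // If the resource was found, this prints the cluster quorum rather than
    // the client-side default (typically "localhost").
    System.out.println("hbase.zookeeper.quorum = " + hconf.get("hbase.zookeeper.quorum"));

Driver-side classpath settings are normally passed at submit time (for example --conf spark.driver.extraClassPath=/etc/hbase/conf), since setting them inside main() happens after the driver JVM has already started.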

But when I run it, I get the following exception:

17/02/22 00:44:28 INFO cluster.YarnClientSchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.8
17/02/22 00:44:28 INFO storage.BlockManagerMasterEndpoint: Registering block manager servername:32831 with 2.1 GB RAM, BlockManagerId(2, servername, 32831)
Exception in thread "main" java.lang.NullPointerException
    at org.apache.hadoop.hbase.spark.HBaseRelation.<init>(DefaultSource.scala:175)
    at org.apache.hadoop.hbase.spark.DefaultSource.createRelation(DefaultSource.scala:78)
    at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:158)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:119)
    at hbasesql.main(hbasesql.java:65)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
17/02/22 00:44:28 INFO spark.SparkContext: Invoking stop() from shutdown hook

My guess is that the SQLContext created from the JavaSparkContext cannot load the hbase-site.xml configuration. Or is there some other problem?
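
For what it's worth, a NullPointerException from HBaseRelation.<init>(DefaultSource.scala:175) is commonly caused by the connector's HBaseContext cache being empty: in this generation of the hbase-spark module, HBaseRelation falls back to the most recently constructed HBaseContext when no configuration is handed to it, and that cached reference is null unless an HBaseContext was created before read()...load(). A minimal sketch of that workaround, reusing context, hconf, colMap and sqlContext from the code above and assuming org.apache.hadoop.hbase.spark.JavaHBaseContext is on the classpath (it ships in the same hbase-spark module as DefaultSource):

    import org.apache.hadoop.hbase.spark.JavaHBaseContext;

    // Constructing a JavaHBaseContext registers hconf in the connector's
    // internal context cache; it must exist before the DataFrame is loaded,
    // otherwise HBaseRelation dereferences a null cached context.
    JavaHBaseContext hbaseContext = new JavaHBaseContext(context, hconf);

    DataFrame df = sqlContext.read().format("org.apache.hadoop.hbase.spark").options(colMap).load();

If that is the cause, it would also explain why HBaseAdmin.checkHBaseAvailable(hconf) succeeds while load() fails: the admin check uses hconf directly, whereas the relation never sees it.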

Any help is greatly appreciated.

Thanks in advance :)

0 Answers:

There are no answers yet.