Dynamically loading any table into HBase with Spark Scala

Time: 2018-10-30 18:52:47

Tags: scala apache-spark hbase

Below is the code I am using to dynamically load a table into HBase, but I am hitting a NullPointerException that I cannot resolve. Please let me know if there is any way to load an arbitrary table into HBase dynamically.

import scala.util.Failure
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Get, HBaseAdmin, HTable, Put}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes

def hbaseDataLoad(tableName: String, hbaseTname: String, columnFname: String): Unit = {

  val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
  hiveContext.setConf("hive.exec.dynamic.partition", "true")
  hiveContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")

  val conf = HBaseConfiguration.create()
  val admin = new HBaseAdmin(conf)
  val df = hiveContext.sql(s"select t.* from $tableName t order by t.firstname")

  if (!admin.isTableAvailable(hbaseTname)) {
    val tableDesc = new HTableDescriptor(hbaseTname)
    admin.createTable(tableDesc)
  }

  val columnNameIndex = df.columns.zipWithIndex.map(x => (x._2, x._1)).toMap

  df.foreach(elmt => {
    val conf = HBaseConfiguration.create()
    val admin = new HBaseAdmin(conf)
    conf.set("hbase.rootdir", "hdfs://")
    conf.set("hbase.zookeeper.quorum", "")
    conf.setInt("hbase.zookeeper.property.clientPort", 2181)
    conf.set(TableInputFormat.INPUT_TABLE, hbaseTname)
    val myTable = new HTable(conf, hbaseTname)
    val p = new Put(elmt.getString(0).getBytes())
    for (i <- 1 until df.columns.length) {
      p.addColumn(columnFname.getBytes(), columnNameIndex.getOrElse(i, s"c$i").getBytes, elmt.getString(i).getBytes)
    }
    myTable.put(p)
  })
}

Even though I get this exception, the whole table, with all of its columns and data, does end up in HBase.

scala> hbaseDataLoad("test.tablename","hbase_test","cf1")
[Stage 13:>                                                         (0 + 1) / 7]18/10/30 17:56:15 ERROR scheduler.TaskSetManager: Task 0 in stage 13.0 failed 4 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 13.0 failed 4 times, most recent failure: Lost task 0.3 in stage 13.0 (TID 49, ebdp-avdc-d177p.sys.comcast.net, executor 9): java.lang.NullPointerException
        at org.apache.spark.sql.Dataset.schema(Dataset.scala:452)
        at org.apache.spark.sql.Dataset.columns(Dataset.scala:503)
        at $anonfun$hbaseDataLoad$1.apply(<console>:111)
        at $anonfun$hbaseDataLoad$1.apply(<console>:102)
        at scala.collection.Iterator$class.foreach(Iterator.scala:893)
        at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
        at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
        at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
        at org.apache.spark.scheduler.Task.run(Task.scala:109)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
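
Looking at the trace, the NullPointerException comes from Dataset.columns being called inside my foreach, so I suspect that referencing df.columns from the closure that runs on the executors may be the problem. Below is a minimal sketch of the direction I am considering instead: capture the column names on the driver first and open one HBase connection per partition rather than per row. The ZooKeeper quorum value and the helper name hbaseDataLoadSketch are placeholders, and I have not verified this end to end, so please correct me if this is the wrong approach.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.{DataFrame, Row}

def hbaseDataLoadSketch(df: DataFrame, hbaseTname: String, columnFname: String): Unit = {

  // Capture plain serializable values on the driver; the DataFrame itself is
  // never referenced inside the closure that runs on the executors.
  val colNames: Array[String] = df.columns

  df.foreachPartition { (rows: Iterator[Row]) =>
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "zk-host")            // placeholder quorum
    conf.setInt("hbase.zookeeper.property.clientPort", 2181)

    // One connection and one table handle per partition, not per row.
    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf(hbaseTname))
    try {
      rows.foreach { row =>
        // First column is used as the row key; null values would still need handling.
        val put = new Put(Bytes.toBytes(row.getString(0)))
        for (i <- 1 until colNames.length) {
          put.addColumn(Bytes.toBytes(columnFname),
                        Bytes.toBytes(colNames(i)),
                        Bytes.toBytes(row.getString(i)))
        }
        table.put(put)
      }
    } finally {
      table.close()
      connection.close()
    }
  }
}

The idea is that only colNames (a plain Array[String]) gets serialized into the task, instead of the DataFrame itself.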

Thanks in advance.

0 Answers:

There are no answers yet.