Below is the code I am using to load a table dynamically into HBase, but I am hitting a NullPointerException that I cannot resolve. Please let me know if there is any way to load an arbitrary table into HBase dynamically.
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Get, HBaseAdmin, HTable, Put}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
def hbaseDataLoad(tableName: String, hbaseTname: String, columnFname: String): Unit = {
  val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
  hiveContext.setConf("hive.exec.dynamic.partition", "true")
  hiveContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
  val conf = HBaseConfiguration.create()
  val admin = new HBaseAdmin(conf)
  val df = hiveContext.sql(s"select t.* from $tableName t order by t.firstname")
  if (!admin.isTableAvailable(hbaseTname)) {
    val tableDesc = new HTableDescriptor(hbaseTname)
    admin.createTable(tableDesc)
  }
  val columnNameIndex = df.columns.zipWithIndex.map(x => (x._2, x._1)).toMap
  df.foreach(elmt => {
    val conf = HBaseConfiguration.create()
    val admin = new HBaseAdmin(conf)
    conf.set("hbase.rootdir", "hdfs://")
    conf.set("hbase.zookeeper.quorum", "")
    conf.setInt("hbase.zookeeper.property.clientPort", 2181)
    conf.set(TableInputFormat.INPUT_TABLE, hbaseTname)
    val myTable = new HTable(conf, hbaseTname)
    var p = new Put(elmt.getString(0).getBytes())
    for (i <- 1 until df.columns.length) {
      p.addColumn(columnFname.getBytes(), columnNameIndex.getOrElse(i, s"c$i").getBytes, elmt.getString(i).getBytes)
    }
    myTable.put(p)
  })
}
I get this exception even though, for the whole table, all of the columns and data do end up in HBase.
scala> hbaseDataLoad("test.tablename","hbase_test","cf1")
[Stage 13:> (0 + 1) / 7]18/10/30 17:56:15 ERROR scheduler.TaskSetManager: Task 0 in stage 13.0 failed 4 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 13.0 failed 4 times, most recent failure: Lost task 0.3 in stage 13.0 (TID 49, ebdp-avdc-d177p.sys.comcast.net, executor 9): java.lang.NullPointerException
at org.apache.spark.sql.Dataset.schema(Dataset.scala:452)
at org.apache.spark.sql.Dataset.columns(Dataset.scala:503)
at $anonfun$hbaseDataLoad$1.apply(<console>:111)
at $anonfun$hbaseDataLoad$1.apply(<console>:102)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:921)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
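From the stack trace, the NullPointerException seems to come from Dataset.columns, i.e. from calling df.columns.length inside the foreach closure, so the DataFrame itself is being referenced on the executors. Below is a rough, unverified sketch of the direction I am considering: capture the column names on the driver before the loop and open one HTable per partition with foreachPartition. The ZooKeeper quorum value is just a placeholder, and null column values would still need handling. Is this the right approach, or is there a better way?

def hbaseDataLoadSketch(tableName: String, hbaseTname: String, columnFname: String): Unit = {
  val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
  val df = hiveContext.sql(s"select t.* from $tableName t order by t.firstname")

  // Evaluated on the driver; a plain Array[String] serializes into the closure,
  // so the executors never touch the DataFrame itself.
  val cols = df.columns

  df.foreachPartition { rows =>
    // One configuration and one HTable per partition instead of per row.
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "zk-host")          // placeholder quorum
    conf.setInt("hbase.zookeeper.property.clientPort", 2181)
    val table = new HTable(conf, hbaseTname)
    rows.foreach { row =>
      // First column as the row key, remaining columns as cells in the given family.
      val put = new Put(Bytes.toBytes(row.getString(0)))
      for (i <- 1 until cols.length) {
        put.addColumn(Bytes.toBytes(columnFname), Bytes.toBytes(cols(i)), Bytes.toBytes(row.getString(i)))
      }
      table.put(put)
    }
    table.close()
  }
}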
Thanks in advance.