向Hbase插入1亿行数据

时间:2019-11-24 12:17:23

标签: hbase ubuntu-18.04

我正在尝试使用Spark HortonWorks Connector(SHC)将数据导入HBase数据库 这是我的脚本:

import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.sql.DataFrame
object Simple{
  def main(args: Array[String]) {

val df = spark.read.csv("/home/hung/Desktop/data/100mils.csv")

val newNames = Seq("id", "c1", "c2", "c3","c4","c5")
val dfRenamed = df.toDF(newNames: _*)

def my_catalog = s"""{
       |"table":{"namespace":"default", "name":"table1"},
       |"rowkey":"key",
       |"columns":{
         |"id":{"cf":"rowkey", "col":"key", "type":"string"},
         |"c2":{"cf":"science", "col":"math", "type":"string"},
         |"c3":{"cf":"science", "col":"physics", "type":"string"},
         |"c4":{"cf":"science", "col":"chemistry", "type":"string"},
         |"c5":{"cf":"language", "col":"english", "type":"string"},
         |"c6":{"cf":"language", "col":"chinese", "type":"string"}
       |}
     |}""".stripMargin

dfRenamed
.write.options(Map(HBaseTableCatalog.tableCatalog 
                   -> my_catalog, HBaseTableCatalog.newTable 
                   ->"5"))
.format("org.apache.spark.sql.execution.datasources.hbase")
.save()

这是我尝试编译代码时的错误

import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.sql.DataFrame
object Simple{
  def main(args: Array[String]) {

val df = spark.read.csv("/home/hung/Desktop/data/100mils.csv")

val newNames = Seq("id", "c1", "c2", "c3","c4","c5")
val dfRenamed = df.toDF(newNames: _*)

def my_catalog = s"""{
       |"table":{"namespace":"default", "name":"table1"},
       |"rowkey":"key",
       |"columns":{
         |"id":{"cf":"rowkey", "col":"key", "type":"string"},
         |"c2":{"cf":"science", "col":"math", "type":"string"},
         |"c3":{"cf":"science", "col":"physics", "type":"string"},
         |"c4":{"cf":"science", "col":"chemistry", "type":"string"},
         |"c5":{"cf":"language", "col":"english", "type":"string"},
         |"c6":{"cf":"language", "col":"chinese", "type":"string"}
       |}
     |}""".stripMargin

dfRenamed
.write.options(Map(HBaseTableCatalog.tableCatalog 
                   -> my_catalog, HBaseTableCatalog.newTable 
                   ->"5"))
.format("org.apache.spark.sql.execution.datasources.hbase")
.save()

我遵循github.com/hortonworks-spark/shc中的指南。 这是我第一次使用scala,因为hbase是我的最终项目。

0 个答案:

没有答案