Error when running Spark Scala code for bulk load

Date: 2017-09-14 16:59:53

Tags: scala apache-spark hbase

I am using the following code in the REPL to create HFiles and bulk load them into HBase. The exact same code runs fine through spark-submit with no errors, but when I run it in the REPL it throws an error.

import org.apache.spark._
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.{ConnectionFactory, HTable}
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.StringType

import scala.collection.mutable.ArrayBuffer
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions


val cdt = "dt".getBytes
val ctemp="temp".getBytes
val ctemp_min="temp_min".getBytes
val ctemp_max="temp_max".getBytes
val cpressure="pressure".getBytes
val csea_level="sea_level".getBytes
val cgrnd_level="grnd_level".getBytes
val chumidity="humidity".getBytes
val ctemp_kf="temp_kf".getBytes
val cid="id".getBytes
val cweather_main="weather_main".getBytes
val cweather_description="weather_description".getBytes
val cweather_icon="weather_icon".getBytes
val cclouds_all="clouds_all".getBytes
val cwind_speed="wind_speed".getBytes
val cwind_deg="wind_deg".getBytes
val csys_pod="sys_pod".getBytes
val cdt_txt="dt_txt".getBytes
val crain="rain".getBytes
val COLUMN_FAMILY = "data".getBytes
val cols = ArrayBuffer(cdt,ctemp,ctemp_min,ctemp_max,cpressure,csea_level,cgrnd_level,chumidity,ctemp_kf,cid,cweather_main,cweather_description,cweather_icon,cclouds_all,cwind_speed,cwind_deg,csys_pod,cdt_txt,crain)
val rowKey = new ImmutableBytesWritable()

val conf = HBaseConfiguration.create()

 val ZOOKEEPER_QUORUM = "address"

conf.set("hbase.zookeeper.quorum", ZOOKEEPER_QUORUM);

val connection = ConnectionFactory.createConnection(conf)

 val df = sqlContext.read.format("com.databricks.spark.csv").option("header","true").option("inferschema","true").load("Hbasedata/Weatherdata.csv")

val rdd = df.flatMap(x => {                       // Error when I run this
        rowKey.set(x(0).toString.getBytes)
        for(i <- 0 to cols.length - 1) yield {
          val index = x.fieldIndex(new String(cols(i)))
          val value = if (x.isNullAt(index)) "".getBytes else x(index).toString.getBytes
          (rowKey,new KeyValue(rowKey.get, COLUMN_FAMILY, cols(i), value))
        }
      })

It throws the following error:

org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
    at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:2067)
    at org.apache.spark.rdd.RDD$$anonfun$flatMap$1.apply(RDD.scala:333)
    at org.apache.spark.rdd.RDD$$anonfun$flatMap$1.apply(RDD.scala:332)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.RDD.flatMap(RDD.scala:332)
    at org.apache.spark.sql.DataFrame.flatMap(DataFrame.scala:1418)

The error is thrown when I try to create the RDD. The same code works fine with spark-submit.

1 Answer:

Answer (score: 0)

The problem is in this line:
val rowKey = new ImmutableBytesWritable()

ImmutableBytesWritable is not serializable, and it is defined outside the "flatMap" function, so the closure has to capture it. Check the full stack trace of the exception. This also explains the difference from spark-submit: the Spark shell wraps every top-level val in a generated wrapper object, so a closure defined in the REPL can drag in extra driver-side state that a compiled application would not.

You can move that statement inside the "flatMap" function, at least to verify that this is the cause.
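
A minimal sketch of that change, reusing the df, cols and COLUMN_FAMILY values already defined above (not verified against the asker's environment): the ImmutableBytesWritable is constructed inside the closure, so no non-serializable driver-side object is captured.

val rdd = df.flatMap(x => {
  // Build the row key inside the closure instead of capturing the driver-side val
  val rowKey = new ImmutableBytesWritable(x(0).toString.getBytes)
  for (i <- 0 until cols.length) yield {
    val index = x.fieldIndex(new String(cols(i)))
    val value = if (x.isNullAt(index)) "".getBytes else x(index).toString.getBytes
    (rowKey, new KeyValue(rowKey.get, COLUMN_FAMILY, cols(i), value))
  }
})

Constructing the key per row also sidesteps the aliasing hazard of reusing one mutable Writable for every emitted tuple.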