我在REPL中使用以下代码来创建hfiles并将批量加载到hbase中。我使用了相同的代码并完成了spark-submit它工作正常但没有错误但是当我在REPL中运行它时它会抛出错误
import org.apache.spark._
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.{ConnectionFactory, HTable}
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.StringType
import scala.collection.mutable.ArrayBuffer
import org.apache.hadoop.hbase.KeyValue
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
val cdt = "dt".getBytes
val ctemp="temp".getBytes
val ctemp_min="temp_min".getBytes
val ctemp_max="temp_max".getBytes
val cpressure="pressure".getBytes
val csea_level="sea_level".getBytes
val cgrnd_level="grnd_level".getBytes
val chumidity="humidity".getBytes
val ctemp_kf="temp_kf".getBytes
val cid="id".getBytes
val cweather_main="weather_main".getBytes
val cweather_description="weather_description".getBytes
val cweather_icon="weather_icon".getBytes
val cclouds_all="clouds_all".getBytes
val cwind_speed="wind_speed".getBytes
val cwind_deg="wind_deg".getBytes
val csys_pod="sys_pod".getBytes
val cdt_txt="dt_txt".getBytes
val crain="rain".getBytes
val COLUMN_FAMILY = "data".getBytes
val cols = ArrayBuffer(cdt,ctemp,ctemp_min,ctemp_max,cpressure,csea_level,cgrnd_level,chumidity,ctemp_kf,cid,cweather_main,cweather_description,cweather_icon,cclouds_all,cwind_speed,cwind_deg,csys_pod,cdt_txt,crain)
val rowKey = new ImmutableBytesWritable()
val conf = HBaseConfiguration.create()
val ZOOKEEPER_QUORUM = "address"
conf.set("hbase.zookeeper.quorum", ZOOKEEPER_QUORUM);
val connection = ConnectionFactory.createConnection(conf)
val df = sqlContext.read.format("com.databricks.spark.csv").option("header","true").option("inferschema","true").load("Hbasedata/Weatherdata.csv")
val rdd = df.flatMap(x => { //Error when i run this
rowKey.set(x(0).toString.getBytes)
for(i <- 0 to cols.length - 1) yield {
val index = x.fieldIndex(new String(cols(i)))
val value = if (x.isNullAt(index)) "".getBytes else x(index).toString.getBytes
(rowKey,new KeyValue(rowKey.get, COLUMN_FAMILY, cols(i), value))
}
})
抛出以下错误
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2067)
at org.apache.spark.rdd.RDD$$anonfun$flatMap$1.apply(RDD.scala:333)
at org.apache.spark.rdd.RDD$$anonfun$flatMap$1.apply(RDD.scala:332)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.flatMap(RDD.scala:332)
at org.apache.spark.sql.DataFrame.flatMap(DataFrame.scala:1418)
当我尝试创建rdd时抛出错误。我在spark-submit中使用了相同的代码它工作正常。
答案 0 :(得分:0)
中的问题
val rowKey = new ImmutableBytesWritable()
ImmutableBytesWritable不可序列化,位于“flatMap”函数之外。请检查异常完整堆栈跟踪。
你可以在“flatMap”函数中移动提到的语句,至少是为了检查。