My code works fine in Spark 1.6, but the same code throws a NullPointerException when run on Spark 2.2. I am currently running everything locally through IntelliJ:
val sparkConf = new SparkConf()
  .setAppName("HbaseSpark")
  .setMaster("local[*]")
  .set("spark.hbase.host", "localhost")

val sc = new SparkContext(sparkConf)
val sqlContext = new SQLContext(sc)

val df = sqlContext
  .read
  .format("com.databricks.spark.csv")
  .option("delimiter", "\001")
  .load("/Users/11130/small")

val df1 = df.withColumn("row_key", concat(col("C3"), lit("_"), col("C5"), lit("_"), col("C0")))
df1.registerTempTable("mytable")

val newDf = sqlContext.sql("Select row_key, C0, C1, C2, C3, C4, C5, C6, C7," +
  "C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19 from mytable")

val rdd = newDf.rdd
val finalRdd = rdd.map(row => (row(0).toString, row(1).toString, row(2).toString, row(3).toString, row(4).toString, row(5).toString, row(6).toString,
  row(7).toString, row(8).toString, row(9).toString, row(10).toString, row(11).toString, row(12).toString, row(13).toString,
  row(14).toString, row(15).toString, row(16).toString, row(17).toString, row(18).toString, row(19).toString, row(20).toString))

finalRdd.toHBaseTable("mytable")
  .toColumns("event_id", "device_id", "uidx", "session_id", "server_ts", "client_ts", "event_type", "data_set_name",
    "screen_name", "card_type", "widget_item_whom", "widget_whom", "widget_v_position", "widget_item0_h_position",
    "publisher_tag", "utm_medium", "utm_source", "utmCampaign", "referrer_url", "notificationClass")
  .inColumnFamily("mycf")
  .save()
However, when I write the same thing for Spark 2.2, the code throws a NullPointerException while converting rdd to finalRdd:
val spark = SparkSession
  .builder
  .appName("FunnelSpark")
  .master("local[*]")
  .config("spark.hbase.host", "localhost")
  .getOrCreate

val sc = spark.sparkContext
sc.hadoopConfiguration.set("spark.hbase.host", "localhost")

val df = spark
  .read
  .option("delimiter", "\001")
  .csv("/Users/11130/small")

val df1 = df.withColumn("row_key", concat(col("_c3"), lit("_"), col("_c5"), lit("_"), col("_c0")))
df1.createOrReplaceTempView("mytable")

val newDf = spark.sql("Select row_key, _c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7," +
  "_c8, _c9, _c10, _c11, _c12, _c13, _c14, _c15, _c16, _c17, _c18, _c19 from mytable")

val rdd = newDf.rdd
val finalRdd = rdd.map(row => (row(0).toString, row(1).toString, row(2).toString, row(3).toString, row(4).toString, row(5).toString, row(6).toString,
  row(7).toString, row(8).toString, row(9).toString, row(10).toString, row(11).toString, row(12).toString, row(13).toString,
  row(14).toString, row(15).toString, row(16).toString, row(17).toString, row(18).toString, row(19).toString, row(20).toString))

println(finalRdd.first())
spark.stop()
Stacktrace: https://jpst.it/15srX
Answer (score: 2):
This happens because your code is unsafe. When you call:
row(i).toString
it throws an NPE every time it encounters a null value. You should use instead:
row.getString(i)
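For illustration, a minimal null-safe sketch of the mapping (the helper s and the five columns shown are just placeholders; extend it to all 21 columns):

val safeRdd = rdd.map { row =>
  // getString returns null instead of throwing, so substitute an empty string for missing values
  def s(i: Int): String = Option(row.getString(i)).getOrElse("")
  (s(0), s(1), s(2), s(3), s(4))
}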
Your 1.6 program also uses a different source than your 2.2 program: spark-csv and the built-in csv format are similar, but not the same. The former treats empty strings as empty strings, the latter treats them as nulls.
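If you want the 2.2 program to keep the 1.6 behaviour in this respect, one option (a sketch, assuming the affected columns are all strings) is to replace the nulls right after reading, before building row_key:

val df = spark
  .read
  .option("delimiter", "\001")
  .csv("/Users/11130/small")
  .na.fill("") // replace nulls in string columns with empty strings, mimicking spark-csv

This also prevents row_key itself from becoming null, since concat returns null as soon as any of its inputs is null.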