This is actually a follow-up to my question Convert JSON objects to RDD. The solution I implemented based on the answer given there is:
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd._
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.DeserializationFeature
case class myRec(
  client_title: String,
  made_on_behalf: String,
  country: String,
  email_address: String,
  special_request_made: String,
  number_of_rooms: String,
  is_phone_booking: String,
  cancelled: String)

def prepJson(infile: String): RDD[myRec] = {
  val input = sc.wholeTextFiles(infile).map(_._2)
  input.mapPartitions(records => {
    val mapper = new ObjectMapper with ScalaObjectMapper
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
    mapper.registerModule(DefaultScalaModule)
    records.flatMap(record => {
      try {
        mapper.readValue(record, classOf[List[myRec]])
      } catch {
        case e: Exception => None
      }
    })
  })
}
After successfully reading the JSON file, the result is not the case class but a HashMap. The generated HashMaps:
Map(client_title -> Mr., made_on_behalf -> 0, country -> Brussel, email_address -> 15e29034@gmail.com, cancelled -> 0, is_phone_booking -> 1, special_request -> 0, ...)
Map(client_title -> Mr., made_on_behalf -> 0, country -> Bundesliga, email_address -> aae665d95c5d630@aol.com, cancelled -> 1, is_phone_booking -> 1, special_request -> 1, ...)
Map(client_title -> Mr., made_on_behalf -> 0, country -> Japan, email_address -> fef412c714ff@yahoo.com, cancelled -> 0, is_phone_booking -> 0, special_request -> 0, ...)
I want to extract some features from the resulting case class RDD into a tuple, so I tried this:
def process_data(data_json: String): RDD[(Double, Array[Double])] = {
  // Read data
  val my_Data = prepJson(data_json)
  my_Data.map { rec =>
    val values = Array(
      rec.made_on_behalf.toDouble,
      rec.special_request_made.toDouble,
      rec.number_of_rooms.toDouble,
      rec.amount.toDouble,
      rec.is_phone_booking.toDouble
    )
    new Tuple2(rec.cancelled.toDouble, values)
  }
}
When I call the function:
val sample_data = process_data("file:///path/to/dataset.json")
sample_data.saveAsTextFile("file:///path/to/data.txt")
I unfortunately get this error:
java.lang.ClassCastException: scala.collection.immutable.HashMap$HashTrieMap cannot be cast to myRec
The question is: how do I walk through the hash maps so that the process_data function returns an array of Tuples?
Thanks!
Answer 0 (score: 1)
The prepJson method does not work as expected. The conversion into the case class MyRec does not happen: instead of an RDD[MyRec] you get an RDD[Map[_,_]] at runtime.
I set up a complete scenario:
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd._
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.DeserializationFeature
case class MyRec(
  client_title: String,
  made_on_behalf: String,
  country: String,
  email_address: String,
  special_request_made: String,
  number_of_rooms: String,
  is_phone_booking: String,
  cancelled: String)

object D {

  def main(args: Array[String]): Unit = {
    val c = new SparkConf()
      .setAppName("Spark eCommerce ETL.")
      .setMaster("local[*]")
    val sc = new SparkContext(c)

    // this is what you actually have
    // ==================================================
    // val mapRdd: RDD[Map[_, _]] = prepJson("./data/dataset.json", sc)
    // // debug
    // val result = mapRdd.collect()
    // println("Result of prepJson is : " + result.mkString(", ") )
    // debug end

    // But this is what you want
    // ==================================================
    val myRecRdd: RDD[MyRec] = prepJsonDoesNotWork("./data/dataset.json", sc)
    val a = myRecRdd.collect()
    println("MyRecs read from prepJson: " + a.mkString(", "))

    val x = process_data(myRecRdd, sc)
    x.saveAsTextFile("./data/output.json")

    sc.stop
  }

  def process_data(myRecRdd: RDD[MyRec], sc: SparkContext): RDD[(Double, Array[Double])] = {
    myRecRdd.map { rec =>
      val values = Array(
        rec.made_on_behalf.toDouble,
        rec.special_request_made.toDouble,
        rec.number_of_rooms.toDouble,
        // rec.amount.toDouble,
        rec.is_phone_booking.toDouble
      )
      new Tuple2(rec.cancelled.toDouble, values)
    }
  }

  def prepJsonDoesNotWork(inputFile: String, sc: SparkContext): RDD[MyRec] = {
    val input = sc.wholeTextFiles(inputFile).map(_._2)

    // Parse it into a specific case class. We use mapPartitions because:
    // (a) ObjectMapper is not serializable, so we either create a singleton object encapsulating
    //     ObjectMapper on the driver and send data back to the driver to go through it,
    //     or we let each node create its own ObjectMapper, which is expensive inside a plain map.
    // (b) To create an ObjectMapper on each node without that cost, we create one per
    //     partition with mapPartitions. This solves the serialization and object-creation performance hit.
    val result = input.mapPartitions((records: Iterator[String]) => {
      // mapper object created on each executor node
      val mapper = new ObjectMapper with ScalaObjectMapper
      mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
      mapper.registerModule(DefaultScalaModule)

      // We use flatMap to handle errors
      // by returning an empty list (None) if we encounter an issue and a
      // list with one element if everything is ok (List(_)).
      records.flatMap(record => {
        println("RECORD: " + record)
        try {
          val x: List[MyRec] = mapper.readValue(record, classOf[List[MyRec]])
          println("MyRecs:" + x)
          println("x size: " + x.size)
          println("x0: " + x(0))
          println("x1: " + x(1))
          x.foreach((t: MyRec) => println(" My rec in prepJson " + t))
          x
        } catch {
          case e: Exception => None
        }
      })
    })
    result
  }
}
And the JSON input:
[
{"time": "2015-05-01 02:25:47",
"client_title": "Mr.",
"made_on_behalf": 0,
"country": "Brussel",
"email_address": "15e29034@gmail.com"},
{"time": "2015-05-01 04:15:03",
"client_title": "Mr.",
"made_on_behalf": 0,
"country": "Bundesliga",
"email_address": "aae665d95c5d630@aol.com"},
{"time": "2015-05-01 06:29:18",
"client_title": "Mr.",
"made_on_behalf": 0,
"country": "Japan",
"email_address": "fef412c714ff@yahoo.com"}
]
The (basic) output becomes:
RECORD: [
{"time": "2015-05-01 02:25:47",
"client_title": "Mr.",
"made_on_behalf": 0,
"country": "Brussel",
"email_address": "15e29034@gmail.com"},
{"time": "2015-05-01 04:15:03",
"client_title": "Mr.",
"made_on_behalf": 0,
"country": "Bundesliga",
"email_address": "aae665d95c5d630@aol.com"},
{"time": "2015-05-01 06:29:18",
"client_title": "Mr.",
"made_on_behalf": 0,
"country": "Japan",
"email_address": "fef412c714ff@yahoo.com"}
]
MyRecs:List(Map(email_address -> 15e29034@gmail.com, country -> Brussel, client_title -> Mr., time -> 2015-05-01 02:25:47, made_on_behalf -> 0), Map(email_address -> aae665d95c5d630@aol.com, country -> Bundesliga, client_title -> Mr., time -> 2015-05-01 04:15:03, made_on_behalf -> 0), Map(email_address -> fef412c714ff@yahoo.com, country -> Japan, client_title -> Mr., time -> 2015-05-01 06:29:18, made_on_behalf -> 0))
x size: 3
x0: Map(email_address -> 15e29034@gmail.com, country -> Brussel, client_title -> Mr., time -> 2015-05-01 02:25:47, made_on_behalf -> 0)
x1: Map(email_address -> aae665d95c5d630@aol.com, country -> Bundesliga, client_title -> Mr., time -> 2015-05-01 04:15:03, made_on_behalf -> 0)
MyRecs read from prepJson:
As you can see, x0 (the first MyRec entry read here) is in fact a Map, so the conversion into instances of the case class does not take place. For this reason process_data either fails (cast exception) or the returned RDD is empty (in my case; see "MyRecs read from prepJson:", which is empty).
This means the correct typing of prepJson is:
def prepJson(inputFile: String, sc: SparkContext): RDD[Map[_,_]] = {
  val input = sc.wholeTextFiles(inputFile).map(_._2)

  // Parse the text into maps. We use mapPartitions because:
  // (a) ObjectMapper is not serializable, so we either create a singleton object encapsulating
  //     ObjectMapper on the driver and send data back to the driver to go through it,
  //     or we let each node create its own ObjectMapper, which is expensive inside a plain map.
  // (b) To create an ObjectMapper on each node without that cost, we create one per
  //     partition with mapPartitions. This solves the serialization and object-creation performance hit.
  val result = input.mapPartitions((records: Iterator[String]) => {
    // mapper object created on each executor node
    val mapper = new ObjectMapper with ScalaObjectMapper
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
    mapper.registerModule(DefaultScalaModule)

    // We use flatMap to handle errors
    // by returning an empty list (None) if we encounter an issue and a
    // list with one element if everything is ok (List(_)).
    records.flatMap(record => {
      println("RECORD: " + record)
      try {
        val x: List[Map[_,_]] = mapper.readValue(record, classOf[List[Map[_,_]]])
        x
      } catch {
        case e: Exception => None
      }
    })
  })
  result
}
which then correctly returns:
Result of prepJson is : Map(email_address -> 15e29034@gmail.com, country -> Brussel, client_title -> Mr., time -> 2015-05-01 02:25:47, made_on_behalf -> 0), Map(email_address -> aae665d95c5d630@aol.com, country -> Bundesliga, client_title -> Mr., time -> 2015-05-01 04:15:03, made_on_behalf -> 0), Map(email_address -> fef412c714ff@yahoo.com, country -> Japan, client_title -> Mr., time -> 2015-05-01 06:29:18, made_on_behalf -> 0)
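Building on that, here is a hedged sketch (untested; the field names and the 0.0 fallback for missing keys are my assumptions) of how process_data could be written directly against the RDD[Map[_,_]] that prepJson actually returns, placed in the same object as the methods above:

def process_data_from_maps(inputFile: String, sc: SparkContext): RDD[(Double, Array[Double])] = {
  // Look a key up in the untyped map and coerce its value to Double,
  // falling back to 0.0 when the key is missing (assumption, adjust as needed).
  def field(rec: Map[_, _], key: String): Double =
    rec.asInstanceOf[Map[String, Any]].get(key).map(_.toString.toDouble).getOrElse(0.0)

  prepJson(inputFile, sc).map { rec =>
    val values = Array(
      field(rec, "made_on_behalf"),
      field(rec, "special_request_made"),
      field(rec, "number_of_rooms"),
      field(rec, "is_phone_booking")
    )
    (field(rec, "cancelled"), values)
  }
}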
For a working Jackson-based approach, ask @Holden or @zero323 in Convert JSON objects to RDD, since I am not a Jackson expert.
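That said, one likely cause is type erasure: classOf[List[MyRec]] does not carry the element type, so Jackson falls back to maps. jackson-module-scala's ScalaObjectMapper also provides a Manifest-based readValue[T] that does carry it; here is a hedged, untested sketch using it (the name prepJsonTyped is mine):

def prepJsonTyped(inputFile: String, sc: SparkContext): RDD[MyRec] = {
  val input = sc.wholeTextFiles(inputFile).map(_._2)
  input.mapPartitions { records =>
    // One mapper per partition, as in the versions above.
    val mapper = new ObjectMapper with ScalaObjectMapper
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
    mapper.registerModule(DefaultScalaModule)
    records.flatMap { record =>
      try {
        // readValue[T: Manifest] keeps List[MyRec] as the target type,
        // so each element should come back as a MyRec rather than a Map.
        mapper.readValue[List[MyRec]](record)
      } catch {
        case e: Exception => Nil
      }
    }
  }
}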
Why not use Spark SQL for the parsing instead? This of course requires a JSON file with one MyRec entry per line (rather than a fully compliant JSON array), so:
{"time": "2015-05-01 02:25:47", "client_title": "Mr.", "made_on_behalf": 0, "country": "Brussel", "email_address": "15e29034@gmail.com"}
{"time": "2015-05-01 04:15:03", "client_title": "Mr.", "made_on_behalf": 0, "country": "Bundesliga", "email_address": "aae665d95c5d630@aol.com"}
With the data in that format, simply create a SQLContext and a DataFrame from the JSON via:
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{SQLContext, DataFrame, UserDefinedFunction}
import org.apache.spark.sql.functions._
def readJson(inputFile: String, sc: SparkContext): DataFrame = {
  val sqlContext = new org.apache.spark.sql.SQLContext(sc)
  sqlContext.read.json(inputFile)
}
Once the DataFrame has been loaded, you can use it to extract the fields you need.
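For example, here is a hedged sketch (assuming the one-record-per-line file also contains the numeric booking fields used earlier; the name process_data_sql is mine) of building the (label, features) pairs from the DataFrame:

def process_data_sql(inputFile: String, sc: SparkContext): RDD[(Double, Array[Double])] = {
  readJson(inputFile, sc)
    .select("cancelled", "made_on_behalf", "special_request_made",
            "number_of_rooms", "is_phone_booking")
    .rdd                          // back to an RDD[Row]
    .map { row =>
      val values = Array(
        row.get(1).toString.toDouble,
        row.get(2).toString.toDouble,
        row.get(3).toString.toDouble,
        row.get(4).toString.toDouble
      )
      (row.get(0).toString.toDouble, values)
    }
}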