I have a JSON file with log entries:
{"a": "cat1", "b": "name", "c": "Caesar", "d": "2016-10-01"}
{"a": "cat1", "b": "legs", "c": "4", "d": "2016-10-01"}
{"a": "cat1", "b": "color", "c": "black", "d": "2016-10-01"}
{"a": "cat1", "b": "tail", "c": "20cm", "d": "2016-10-01"}
{"a": "cat2", "b": "name", "c": "Dickens", "d": "2016-10-02"}
{"a": "cat2", "b": "legs", "c": "4", "d": "2016-10-02"}
{"a": "cat2", "b": "color", "c": "red", "d": "2016-10-02"}
{"a": "cat2", "b": "tail", "c": "15cm", "d": "2016-10-02"}
{"a": "cat2", "b": "ears", "c": "5cm", "d": "2016-10-02"}
{"a": "cat1", "b": "tail", "c": "10cm", "d": "2016-10-10"}
Desired output:
("id": "cat1", "name": "Caesar", "legs": "4", "color": "black", "tail": "10cm", "day": "2016-10-10")
("id": "cat2", "name": "Dickens", "legs": "4", "color": "red", "tail": "10cm", "ears": "5cm", "day": "2016-10-02")
I could do this step by step with for loops and collect(), but I would like to do it the proper way with map, flatMap, aggregateByKey and the other Spark magic. Here is my attempt:
import org.apache.spark.{SparkConf, SparkContext}

case class cat_input(a: String, b: String, c: String, d: String)
case class cat_output(id: String, name: String, legs: String, color: String, tail: String, day: String, ears: String, claws: String)
object CatLog {
  def main(args: Array[String]) {
    val sconf = new SparkConf().setAppName("Cat log")
    val sc = new SparkContext(sconf)
    sc.setLogLevel("WARN")
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._
    val df = sqlContext.read.json("cats1.txt").as[cat_input]

    val step1 = df.rdd.groupBy(_.a)
    // step1: RDD[(String, Iterable[cat_input])], e.g. ("cat1", CompactBuffer(cat_input("cat1", "name", "Caesar", "2016-10-01"), ...))
    val step2 = step1.map(x => x._2)
    // step2: RDD[Iterable[cat_input]] -- note that the cat id is dropped here
    val step3 = step2.map(y => (y.b, y.c))
    // intended to produce pairs like ("name", "Caesar"), but y is an Iterable[cat_input], so this does not compile
    val step4 = step3.map( case(x,y) => { cat_output(x) = y })
    // it should return cat_output(id: "cat1", name: "Caesar", legs: "4", color: "black", tail: "10cm", day: NULL, ears: NULL, claws: NULL)
  }
}
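To make the intent of step4 concrete, here is a small sketch of the per-group conversion written with plain Scala collections (added here for illustration, it is not part of the original question); the ISO date strings sort lexicographically, so maxBy(_.d) picks the latest record per attribute:

def toCatOutput(id: String, records: Iterable[cat_input]): cat_output = {
  // for every attribute name, keep the value from the record with the latest date
  val attrs: Map[String, String] =
    records.groupBy(_.b).map { case (attr, recs) => attr -> recs.maxBy(_.d).c }
  // the "day" column is the latest date seen for this cat
  val day = records.map(_.d).max
  cat_output(
    id,
    attrs.getOrElse("name", null),
    attrs.getOrElse("legs", null),
    attrs.getOrElse("color", null),
    attrs.getOrElse("tail", null),
    day,
    attrs.getOrElse("ears", null),
    attrs.getOrElse("claws", null))
}

// usage sketch: keep the cat id around instead of dropping it in step2
// val result = df.rdd.groupBy(_.a).map { case (id, recs) => toCatOutput(id, recs) }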
Answer 0 (score: 0)
This assumes the data has unique attributes for each cat (cat1, cat2); you would need some extra logic for duplicates. You can try something along these lines with your case classes:
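A reconstructed sketch of what that could look like (the answer's original code block was lost, so this is not its exact code): it pivots each cat's (b, c) pairs into cat_output fields and leaves the day empty, as the note below explains.

val byCat = df.rdd.groupBy(_.a)
val pivoted = byCat.map { case (id, recs) =>
  // last value wins if an attribute name repeats within a group
  val attrs = recs.map(r => r.b -> r.c).toMap
  cat_output(
    id,
    attrs.getOrElse("name", null),
    attrs.getOrElse("legs", null),
    attrs.getOrElse("color", null),
    attrs.getOrElse("tail", null),
    null, // day is left out; see the note below about the max-date logic
    attrs.getOrElse("ears", null),
    attrs.getOrElse("claws", null))
}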
Also note that I did not apply the actual "date" here, because of the duplicates. That needs another map() plus max logic to get the latest value per key, and then a join of the two datasets.
Answer 1 (score: 0)
One approach is to use the aggregateByKey function and accumulate the answer in a mutable Map.
// case classes defined outside main()
case class cat_input(a: String, b: String, c: String, d: String)

val df = sqlContext.read.json("cats1.txt").as[cat_input]

// seqOp: fold one record into the accumulator map, overwriting an existing attribute
// only if the record's date is at least as new, and tracking the latest date under "date"
val add_to_map = (a: scala.collection.mutable.Map[String, String], x: cat_input) => {
  val ts = x.d
  if (a contains "date") {
    if ((a contains x.b) && (ts >= a("date"))) {
      // newer record for an attribute we already have: overwrite it
      a(x.b) = x.c
      a("date") = ts
    } else if (!(a contains x.b)) {
      // first occurrence of this attribute
      a(x.b) = x.c
      if (a("date") < ts) {
        a("date") = ts
      }
    }
  } else {
    // empty accumulator: take the record as-is
    a(x.b) = x.c
    a("date") = ts
  }
  a
}

// combOp: merge two partial maps by overlaying the newer map's values onto the
// older one, so no attribute is lost and the newer value wins on conflicts
val merge_maps = (a: scala.collection.mutable.Map[String, String], b: scala.collection.mutable.Map[String, String]) => {
  if (a("date") > b("date")) {
    a.keys.foreach(k => b(k) = a(k))
    b
  } else {
    b.keys.foreach(k => a(k) = b(k))
    a
  }
}

// aggregateByKey is a pair-RDD operation, so drop down from the Dataset to its RDD
val step3 = df.rdd
  .map(x => (x.a, x))
  .aggregateByKey(scala.collection.mutable.Map[String, String]())(add_to_map, merge_maps)
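The answer stops at step3, which is an RDD[(String, scala.collection.mutable.Map[String, String])]. A possible final step (my addition, not part of the original answer) is to map each aggregated map onto the cat_output case class from the question:

val result = step3.map { case (id, attrs) =>
  cat_output(
    id,
    attrs.getOrElse("name", null),
    attrs.getOrElse("legs", null),
    attrs.getOrElse("color", null),
    attrs.getOrElse("tail", null),
    attrs.getOrElse("date", null), // the "date" entry plays the role of "day"
    attrs.getOrElse("ears", null),
    attrs.getOrElse("claws", null))
}
result.collect().foreach(println)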