关于 combineByKey 的问题

时间:2019-07-17 15:33:59

标签: scala apache-spark

对于以下输入 => [('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)],经 combineByKey 处理后,我期望得到下面的输出:

预期输出=> [('A', [(3, 9), (4, 16), (5, 25)]), ('B', [(1, 1), (2, 4)])]

scala> val x = sc.parallelize(Array(('B',1),('B',2),('A',3),('A',4),('A',5)))
x: org.apache.spark.rdd.RDD[(Char, Int)] = ParallelCollectionRDD[46] at parallelize at <console>:24

// createCombiner (asker's attempt): turns one Int value into the String
// "<element>,<element squared>", e.g. 3 -> "3,9". Passed as the first argument to combineByKey below.
scala> def createCombiner (element:Int) :String = (element.toString + "," + Math.pow(element,2).toInt)
createCombiner: (element: Int)String

// mergeValue (asker's attempt): appends the element and its square to the accumulator string.
// No separator is inserted between the accumulator, the element, or the square,
// e.g. ("3,9", 4) -> "3,9416".
scala> def mergeValue (accumlator:String, element:Int) : String = (accumlator + (element.toString + Math.pow(element,2).toInt))
mergeValue: (accumlator: String, element: Int)String

// mergeComb (asker's attempt): concatenates two accumulator strings with no separator,
// e.g. ("3,9", "4,16") -> "3,94,16".
// NOTE(review): the collected result below (A -> 3,94,165,25) equals "3,9" + "4,16" + "5,25",
// i.e. it matches this separator-less concatenation.
scala> def mergeComb (accumlator:String ,accumlator1:String):String = (accumlator + accumlator1)
mergeComb: (accumlator: String, accumlator1: String)String

scala> val combRDD = x.map(t => (t._1, (t._2))).combineByKey(createCombiner, mergeValue, mergeComb)
combRDD: org.apache.spark.rdd.RDD[(Char, String)] = ShuffledRDD[48] at combineByKey at <console>:31

scala> combRDD.collect
res39: Array[(Char, String)] = Array((A,3,94,165,25), (B,1,12,4))

我无法得到预期的输出。我是新手,希望能得到一些指点。

1 个答案:

答案 0 :(得分:1)

试试这样:


scala> val x = sc.parallelize(Array(('B',1),('B',2),('A',3),('A',4),('A',5)))

// createCombiner: wraps one Int value in a single-element list holding the pair
// (element, element squared), e.g. 3 -> List((3, 9)).
scala> def createCombiner(element:Int) : List[(Int, Int)] = List(element -> element * element)

// mergeValue: appends the (element, element squared) pair to the end of the accumulator list,
// reusing createCombiner to build that pair.
scala> def mergeValue (accumulator: List[(Int, Int)], element:Int) : List[(Int, Int)] = accumulator ++ createCombiner(element)

// mergeComb: concatenates two accumulator lists, keeping the entries of `accumulator`
// ahead of those of `accumulator1`.
scala> def mergeComb (accumulator: List[(Int, Int)], accumulator1: List[(Int, Int)]): List[(Int, Int)] = (accumulator ++ accumulator1)

scala> val combRDD = x.combineByKey(createCombiner, mergeValue, mergeComb)

scala> combRDD.collect
// res0: Array[(Char, List[(Int, Int)])] = Array((A,List((3,9), (4,16), (5,25))), (B,List((1,1), (2,4))))

// Or
scala> combRDD.mapValues(_.mkString("[", ", ", "]")).collect
res1: Array[(Char, String)] = Array((A,[(3,9), (4,16), (5,25)]), (B,[(1,1), (2,4)]))