对于以下输入=> [('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
用 combineByKey 处理后,我期望下面的输出
预期输出=> [('A', [(3, 9), (4, 16), (5, 25)]), ('B', [(1, 1), (2, 4)])]
// Build the sample pair RDD of (Char, Int) records from a local array.
scala> val x = sc.parallelize(Array(('B',1),('B',2),('A',3),('A',4),('A',5)))
x: org.apache.spark.rdd.RDD[(Char, Int)] = ParallelCollectionRDD[46] at parallelize at <console>:24
// Builds the initial accumulator string "<element>,<element squared>", e.g. 3 gives "3,9".
scala> def createCombiner (element:Int) :String = (element.toString + "," + Math.pow(element,2).toInt)
createCombiner: (element: Int)String
// Appends the element followed by its square to the accumulator string.
// No separator is inserted, neither before the element nor between the element
// and its square, so merging 4 into "3,9" yields "3,9416".
scala> def mergeValue (accumlator:String, element:Int) : String = (accumlator + (element.toString + Math.pow(element,2).toInt))
mergeValue: (accumlator: String, element: Int)String
// Concatenates two accumulator strings directly, with no separator between them,
// so "3,9" and "4,16" become "3,94,16".
scala> def mergeComb (accumlator:String ,accumlator1:String):String = (accumlator + accumlator1)
mergeComb: (accumlator: String, accumlator1: String)String
// Combine the values of each key with createCombiner, mergeValue and mergeComb.
// The map step rebuilds every (key, value) pair unchanged before combining.
scala> val combRDD = x.map(t => (t._1, (t._2))).combineByKey(createCombiner, mergeValue, mergeComb)
combRDD: org.apache.spark.rdd.RDD[(Char, String)] = ShuffledRDD[48] at combineByKey at <console>:31
scala> combRDD.collect
res39: Array[(Char, String)] = Array((A,3,94,165,25), (B,1,12,4))
我无法得到预期的输出。我是新手,希望能得到一些指点。
答案 0 :(得分:1)
可以试试这样:
// Same sample data as in the question: (Char, Int) pairs built from a local array.
scala> val x = sc.parallelize(Array(('B',1),('B',2),('A',3),('A',4),('A',5)))
// Initial accumulator: a one-element list holding the pair (element, element squared).
scala> def createCombiner(element:Int) : List[(Int, Int)] = List(element -> element * element)
// Appends the (element, element squared) pair built by createCombiner to the end of the accumulator list.
scala> def mergeValue (accumulator: List[(Int, Int)], element:Int) : List[(Int, Int)] = accumulator ++ createCombiner(element)
// Concatenates two accumulator lists; the first list's pairs come before the second's.
scala> def mergeComb (accumulator: List[(Int, Int)], accumulator1: List[(Int, Int)]): List[(Int, Int)] = (accumulator ++ accumulator1)
// Aggregate the values of each key into a List[(Int, Int)] using createCombiner,
// mergeValue and mergeComb, then collect the result as a local array.
scala> val combRDD = x.combineByKey(createCombiner, mergeValue, mergeComb)
scala> combRDD.collect
// res0: Array[(Char, List[(Int, Int)])] = Array((A,List((3,9), (4,16), (5,25))), (B,List((1,1), (2,4))))
// Or
// Render each key's list as a string of the form "[(a,b), (c,d)]" before collecting.
scala> combRDD.mapValues(_.mkString("[", ", ", "]")).collect
res1: Array[(Char, String)] = Array((A,[(3,9), (4,16), (5,25)]), (B,[(1,1), (2,4)]))