我的累加器是一个 Array[Array[Int]]。在 RDD 的 foreach 操作中更新累加器(accumulator)之后,accumulator(0) 与预期一致,但 accumulator(1) 的内容完全丢失,变成了 Array(0,0,0)。
在 RDD 内部,累加器的值为 Array(Array(4,5,6),Array(4,5,6));在 RDD 之外,累加器的值却是 Array(Array(4,5,6),Array(0,0,0))。
下面是代码
import org.apache.spark.Accumulable
import org.apache.spark.AccumulableParam
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
/** Driver program: fills a 2x3 matrix accumulator from an RDD and prints it on both sides. */
object acc {

  /**
    * Renders a matrix as nested `Array(...)` text.
    *
    * Concatenating an `Array` to a string directly only yields its JVM identity string,
    * so the contents have to be formatted explicitly.
    *
    * @param matrix matrix to format
    * @return text such as `Array(Array(4, 5, 6), Array(4, 5, 6))`
    */
  private def render(matrix: Array[Array[Int]]): String =
    matrix.map(_.mkString("Array(", ", ", ")")).mkString("Array(", ", ", ")")

  /**
    * Entry point.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
    try {
      val a = Array(Array(1, 2, 3), Array(4, 5, 6))
      val rdd = sc.parallelize(a)
      val initialValue = Array.fill[Array[Int]](2)(Array.fill[Int](3)(1))
      val accumulator = sc.accumulable(initialValue)(MatrixAccumulatorParam)
      rdd.foreach { x =>
        // Write x into both rows of the matrix, cell by cell, in row-major order.
        // The explicit double parentheses build the (value, row, column) tuple directly
        // instead of relying on argument auto-tupling.
        for (row <- 0 until 2; col <- 0 until 3) {
          accumulator += ((x(col), row, col))
        }
        println("accumulator value in rdd is" + render(accumulator.localValue))
      }
      println("accumulator value out of rdd is :" + render(accumulator.value))
    } finally {
      // Always shut the context down, even when the job above fails.
      sc.stop()
    }
  }
}
/**
  * Accumulator parameter for an `Int` matrix that is updated one cell at a time.
  *
  * Every added term is a `(value, row, column)` triple. Two partial matrices are merged by
  * taking the element-wise maximum.
  */
object MatrixAccumulatorParam extends AccumulableParam[Array[Array[Int]], (Int, Int, Int)] {

  /**
    * Produces the starting matrix for an accumulator.
    *
    * @param initialValue matrix the accumulator was created with
    * @return a deep copy of `initialValue`; copying keeps the in-place writes done by
    *         [[addAccumulator]] from leaking back into the caller's matrix
    */
  def zero(initialValue: Array[Array[Int]]): Array[Array[Int]] =
    initialValue.map(_.clone())

  /**
    * Stores a single cell into the matrix.
    *
    * @param acc   matrix to update; it is modified in place
    * @param value `(value, row, column)` triple describing the cell to overwrite
    * @return the same `acc` instance, after the write
    */
  def addAccumulator(acc: Array[Array[Int]], value: (Int, Int, Int)): Array[Array[Int]] = {
    val (cell, row, col) = value
    acc(row)(col) = cell
    acc
  }

  /**
    * Merges two matrices into a new one holding the element-wise maximum.
    *
    * The result has the shape of `m1`; an empty `m1` yields an empty result instead of
    * failing on the lookup of its first row. `m2` must be at least as large as `m1` in
    * both dimensions, otherwise the index lookup throws.
    *
    * @param m1 first matrix; determines the shape of the result
    * @param m2 second matrix
    * @return a freshly allocated matrix; neither argument is modified
    */
  def addInPlace(m1: Array[Array[Int]], m2: Array[Array[Int]]): Array[Array[Int]] =
    Array.tabulate(m1.length) { row =>
      Array.tabulate(m1(row).length)(col => math.max(m1(row)(col), m2(row)(col)))
    }
}
结果:在 RDD 里面,accumulator 的值是 Array(Array(4,5,6),Array(4,5,6));在 RDD 之外,accumulator 的值是 Array(Array(4,5,6),Array(0,0,0))。
但我期望在 RDD 之外得到的是 Array(Array(4,5,6),Array(4,5,6))。
答案 0 :(得分:3)
addAccumulator 方法
在上面的代码中,accumulator += (x(0),0,0) 会调用 addAccumulator 方法。
完成所有任务后,将调用 addInPlace 方法来聚合所有任务中的累计值。
在上面的代码中,addInPlace 方法会以 initialValue Array(Array(1,1,1),Array(1,1,1)) 和任务累加器的值 Array(Array(4,5,6),Array(4,5,6)) 为参数被调用。
在上面的代码中,我必须在 addInPlace 方法里每次进入循环 while (j < columnLength) { 时重置变量 i。
以下代码运行得非常好。
// Element-wise maximum of m1 and m2, written into updatedMatrix.
// NOTE(review): i, j, columnLength, rowLength, m1, m2 and updatedMatrix are declared outside
// this snippet — presumably inside addInPlace; confirm against the full method.
while (j < columnLength) {
// Reset the inner index for every outer iteration; without this the inner loop would
// only run during the first pass, because i would still equal rowLength afterwards.
i=0
while (i < rowLength) {
// Debug output of the two operands being compared.
println("m1(j)(i)"+ m1(j)(i))
println(" m2(j)(i))"+ m2(j)(i))
val a = Math.max(m1(j)(i), m2(j)(i))
updatedMatrix(j)(i) = a
i += 1
}
j += 1
}
答案 1 :(得分:0)
根据文档,localValue 的值本来就应该不同:
value
。
答案 2 :(得分:0)
我发现将 var i = 0 修改为 i = 0 并没有区别,最终结果都是 Array(Array(4,5,6),Array(4,5,6))。
应用程序的输出是通过 yarn logs -applicationId 提取的。
代码如下:
import org.apache.spark.Accumulable
import org.apache.spark.AccumulableParam
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
/**
  * Driver program: fills a 2x3 matrix accumulator from an RDD, then prints every cell of the
  * task-local value and of the merged driver-side value.
  */
object acc {

  /**
    * Renders a matrix as nested `Array(...)` text.
    *
    * Concatenating an `Array` to a string directly only yields its JVM identity string,
    * so the contents have to be formatted explicitly.
    *
    * @param matrix matrix to format
    * @return text such as `Array(Array(4, 5, 6), Array(4, 5, 6))`
    */
  private def render(matrix: Array[Array[Int]]): String =
    matrix.map(_.mkString("Array(", ", ", ")")).mkString("Array(", ", ", ")")

  /**
    * Entry point.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setSparkHome("/usr/lib/spark")
    conf.setAppName("Simple Application")
    val sc = new SparkContext(conf)
    try {
      val a = Array(Array(1, 2, 3), Array(4, 5, 6))
      val rdd = sc.parallelize(a)
      val initialValue = Array.fill[Array[Int]](2)(Array.fill[Int](3)(1))
      val accumulator = sc.accumulable(initialValue)(MatrixAccumulatorParam)
      rdd.foreach { x =>
        // Write x into both rows of the matrix, cell by cell, in row-major order.
        // The explicit double parentheses build the (value, row, column) tuple directly
        // instead of relying on argument auto-tupling.
        for (row <- 0 until 2; col <- 0 until 3) {
          accumulator += ((x(col), row, col))
        }
        // Dump the task-local matrix, one cell per line, row by row.
        println("accumulator")
        accumulator.localValue.foreach(_.foreach(println))
        println("accumulator value in rdd is" + render(accumulator.localValue))
      }
      // Dump the merged matrix seen by the driver, one cell per line, row by row.
      println("total")
      accumulator.value.foreach(_.foreach(println))
      println("accumulator value out of rdd is :" + render(accumulator.value))
    } finally {
      // Always shut the context down, even when the job above fails.
      sc.stop()
    }
  }
}
/**
  * Accumulator parameter for an `Int` matrix that is updated one cell at a time.
  *
  * Every added term is a `(value, row, column)` triple. Two partial matrices are merged by
  * taking the element-wise maximum; each comparison is traced on standard output.
  */
object MatrixAccumulatorParam extends AccumulableParam[Array[Array[Int]], (Int, Int, Int)] {

  /**
    * Produces the starting matrix for an accumulator.
    *
    * @param initialValue matrix the accumulator was created with
    * @return a deep copy of `initialValue`; copying keeps the in-place writes done by
    *         [[addAccumulator]] from leaking back into the caller's matrix
    */
  def zero(initialValue: Array[Array[Int]]): Array[Array[Int]] =
    initialValue.map(_.clone())

  /**
    * Stores a single cell into the matrix.
    *
    * @param acc   matrix to update; it is modified in place
    * @param value `(value, row, column)` triple describing the cell to overwrite
    * @return the same `acc` instance, after the write
    */
  def addAccumulator(acc: Array[Array[Int]], value: (Int, Int, Int)): Array[Array[Int]] = {
    val (cell, row, col) = value
    acc(row)(col) = cell
    acc
  }

  /**
    * Merges two matrices into a new one holding the element-wise maximum, printing both
    * operands of every comparison in row-major order.
    *
    * The result has the shape of `m1`; an empty `m1` yields an empty result instead of
    * failing on the lookup of its first row. `m2` must be at least as large as `m1` in
    * both dimensions, otherwise the index lookup throws.
    *
    * @param m1 first matrix; determines the shape of the result
    * @param m2 second matrix
    * @return a freshly allocated matrix; neither argument is modified
    */
  def addInPlace(m1: Array[Array[Int]], m2: Array[Array[Int]]): Array[Array[Int]] =
    Array.tabulate(m1.length) { row =>
      Array.tabulate(m1(row).length) { col =>
        // Debug trace of the two cells being merged.
        println(s"m1($row)($col)=${m1(row)(col)} m2($row)($col)=${m2(row)(col)}")
        math.max(m1(row)(col), m2(row)(col))
      }
    }
}
结果是:
accumulator
4
5
6
4
5
6
accumulator
1
2
3
1
2
3
m1(0)(0)=1 m2(0)(0)=1
m1(0)(1)=1 m2(0)(1)=2
m1(0)(2)=1 m2(0)(2)=3
m1(1)(0)=1 m2(1)(0)=1
m1(1)(1)=1 m2(1)(1)=2
m1(1)(2)=1 m2(1)(2)=3
m1(0)(0)=1 m2(0)(0)=4
m1(0)(1)=2 m2(0)(1)=5
m1(0)(2)=3 m2(0)(2)=6
m1(1)(0)=1 m2(1)(0)=4
m1(1)(1)=2 m2(1)(1)=5
m1(1)(2)=3 m2(1)(2)=6
total
4
5
6
4
5
6
并将代码修改为:
// Variant under test: the declaration of i outside the loop is commented out ...
//var i: Int = 0
while (j < columnLength) {
// ... and i is declared inside the outer loop instead, so it starts at 0 on every iteration.
// (The rest of the loop body is elided in this snippet.)
var i =0
结果是:
m1(0)(0)=1 m2(0)(0)=1
m1(0)(1)=1 m2(0)(1)=2
m1(0)(2)=1 m2(0)(2)=3
m1(1)(0)=1 m2(1)(0)=1
m1(1)(1)=1 m2(1)(1)=2
m1(1)(2)=1 m2(1)(2)=3
m1(0)(0)=1 m2(0)(0)=4
m1(0)(1)=2 m2(0)(1)=5
m1(0)(2)=3 m2(0)(2)=6
m1(1)(0)=1 m2(1)(0)=4
m1(1)(1)=2 m2(1)(1)=5
m1(1)(2)=3 m2(1)(2)=6
total
4
5
6
4
5
6
accumulator
1
2
3
1
2
3
accumulator
4
5
6
4
5
6
最终结果是一样的。
但我有两个问题:
@Vijay Innamuri