内部RDD和外部RDD时,火花累加器值不同

时间:2014-12-08 11:54:29

标签: apache-spark accumulator

我的累加器是一个数组[Array [Int]] 在更新RDD的foreach操作中的accumalutor之后,累加器(0)与预期一样,其中累加器(1)是完全丢失的数组(0,0,0)

在RDD内部,累加器值为Array(Array(4,5,6),Array(4,5,6)) 在RDD之外,累加器值是Array(Array(4,5,6),Array(0,0,0))

下面是代码

import org.apache.spark.Accumulable
import org.apache.spark.AccumulableParam
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object acc {
  def main(args: Array[String]) {
     val conf = new SparkConf().setAppName("Simple Application")
  val sc = new SparkContext(conf)
  val a =Array(Array(1,2,3),Array(4,5,6))
  val rdd = sc.parallelize(a)
  val initialValue = Array.fill[Array[Int]](2)(Array.fill[Int](3)(1))
  val accumulator = sc.accumulable(initialValue)(MatrixAccumulatorParam)
  rdd.foreach{x=>
     accumulator += (x(0),0,0)
     accumulator += (x(1),0,1)
     accumulator += (x(2),0,2)
     accumulator += (x(0),1,0)
     accumulator += (x(1),1,1)
     accumulator += (x(2),1,2)
     println("accumulator value in rdd is"+accumulator.localValue)
     }

  println("accumulator value out of rdd is :" + accumulator.value )

  }

}
object MatrixAccumulatorParam extends AccumulableParam[Array[Array[Int]], (Int, Int,   Int)] {

  def zero(initialValue: Array[Array[Int]]): Array[Array[Int]] = {
    initialValue
  }

  def addAccumulator(acc: Array[Array[Int]], value: (Int, Int, Int)): Array[Array[Int]] = {

    acc(value._2)(value._3) = value._1
    acc

  }

   def addInPlace(m1: Array[Array[Int]], m2: Array[Array[Int]]): Array[Array[Int]] = {
    val columnLength: Int = m1.length
    val rowLength: Int = m1(0).length
    var updatedMatrix = Array.ofDim[Int](columnLength, rowLength)

    var j: Int = 0
    while (j < columnLength) {
      var i =0
    while (i < rowLength) {
         val a = Math.max(m1(j)(i), m2(j)(i))
        updatedMatrix(j)(i) = a
        i += 1
      } 
      j += 1
    }

    updatedMatrix
      }


}

结果: 在RDD里面,accumalator值是Array(Array(4,5,6),Array(4,5,6)) 在RDD之外,accumalator值是Array(Array(4,5,6),Array(0,0,0))

但我期待在RDD之外的是阵列(阵列(4,5,6),阵列(4,5,6))

3 个答案:

答案 0 :(得分:3)

只要对accumulator.variable进行更新,就会调用

addAccumulator 方法

在上面的代码累加器+ =(x(0),0,0)中调用addAccumulator方法。

完成所有任务后,将调用 addInPlace 方法来聚合所有任务中的累计值。

在上面的代码中,initialValue数组(1,1,1)数组(1,1,1)和任务累加器值数组(4,5,6)数组(4,5,6)调用addInPlace方法。 / p>

在上面的代码变量中,我必须在进入循环时重置addInPlace方法,而(j&lt; columnLength){

以下代码就像魅力一样。

            while (j < columnLength) {
              i=0
                while (i < rowLength) {
                  println("m1(j)(i)"+ m1(j)(i))
                  println(" m2(j)(i))"+ m2(j)(i))
                    val a = Math.max(m1(j)(i), m2(j)(i))
                            updatedMatrix(j)(i) = a
                            i += 1
                } 
                j += 1
            }

答案 1 :(得分:0)

localValue应该是不同的,根据文件:

  • 这不是累加器的全局值。在
  • 之后获得全局值
  • 完成对数据集的操作,调用value。 *
  • 此方法的典型用法是直接改变本地值,例如,添加
  • Set的元素。 * /

答案 2 :(得分:0)

我发现将var i = 0修改为i = 0没有区别 最终结果是Array(Array(4,5,6),Array(4,5,6))

应用程序的输出由yarn logs -applicationId。

提取

代码是:

import org.apache.spark.Accumulable
import org.apache.spark.AccumulableParam
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object acc {
  def main(args: Array[String]) {
     //val conf = new SparkConf().setAppName("Simple Application")
  val conf = new SparkConf()
  conf.setSparkHome("/usr/lib/spark")
  conf.setAppName("Simple Application")
  val sc = new SparkContext(conf)
  val a =Array(Array(1,2,3),Array(4,5,6))
  val rdd = sc.parallelize(a)
  val initialValue = Array.fill[Array[Int]](2)(Array.fill[Int](3)(1))
  val accumulator = sc.accumulable(initialValue)(MatrixAccumulatorParam)
  rdd.foreach{x=>
     accumulator += (x(0),0,0)
     accumulator += (x(1),0,1)
     accumulator += (x(2),0,2)
     accumulator += (x(0),1,0)
     accumulator += (x(1),1,1)
     accumulator += (x(2),1,2)
     val columnLength: Int = accumulator.localValue.length
     val rowLength: Int = accumulator.localValue(0).length
     var j: Int = 0
     var i: Int = 0
     println("accumulator")
     while(j < columnLength){
        i =0 
        while(i<rowLength){
            println(accumulator.localValue(j)(i))
            i += 1
        }
        j+=1
     }
     println("accumulator value in rdd is"+accumulator.localValue)
     }
     val columnLength: Int = accumulator.value.length
     val rowLength: Int = accumulator.value(0).length
     var j: Int = 0
     var i: Int = 0
     println("total")
     while(j < columnLength){
        i =0 
        while(i<rowLength){
            println(accumulator.value(j)(i))
            i += 1
        }
        j+=1
     }

  println("accumulator value out of rdd is :" + accumulator.value )

  }

}
object MatrixAccumulatorParam extends AccumulableParam[Array[Array[Int]], (Int, Int,   Int)] {

  def zero(initialValue: Array[Array[Int]]): Array[Array[Int]] = {
    initialValue
  }

  def addAccumulator(acc: Array[Array[Int]], value: (Int, Int, Int)): Array[Array[Int]] = {

    acc(value._2)(value._3) = value._1
    acc
  }

   def addInPlace(m1: Array[Array[Int]], m2: Array[Array[Int]]): Array[Array[Int]] = {
    val columnLength: Int = m1.length
    val rowLength: Int = m1(0).length
    var updatedMatrix = Array.ofDim[Int](columnLength, rowLength)

    var j: Int = 0
    var i: Int = 0
    while (j < columnLength) {
    i =0
    while (i < rowLength) {
        println("m1("+j+")("+i+")="+ m1(j)(i) + " m2("+j+")("+i+")="+ m2(j)(i))
        val a = Math.max(m1(j)(i), m2(j)(i))
        updatedMatrix(j)(i) = a
        i += 1
      } 
      j += 1
    }

    updatedMatrix
  }
}

结果是:

accumulator
4
5
6
4
5
6

accumulator
1
2
3
1
2
3

m1(0)(0)=1 m2(0)(0)=1
m1(0)(1)=1 m2(0)(1)=2
m1(0)(2)=1 m2(0)(2)=3
m1(1)(0)=1 m2(1)(0)=1
m1(1)(1)=1 m2(1)(1)=2
m1(1)(2)=1 m2(1)(2)=3
m1(0)(0)=1 m2(0)(0)=4
m1(0)(1)=2 m2(0)(1)=5
m1(0)(2)=3 m2(0)(2)=6
m1(1)(0)=1 m2(1)(0)=4
m1(1)(1)=2 m2(1)(1)=5
m1(1)(2)=3 m2(1)(2)=6

total
4
5
6
4
5
6

并将代码修改为:

    //var i: Int = 0
    while (j < columnLength) {
    var i =0

结果是:

m1(0)(0)=1 m2(0)(0)=1
m1(0)(1)=1 m2(0)(1)=2
m1(0)(2)=1 m2(0)(2)=3
m1(1)(0)=1 m2(1)(0)=1
m1(1)(1)=1 m2(1)(1)=2
m1(1)(2)=1 m2(1)(2)=3
m1(0)(0)=1 m2(0)(0)=4
m1(0)(1)=2 m2(0)(1)=5
m1(0)(2)=3 m2(0)(2)=6
m1(1)(0)=1 m2(1)(0)=4
m1(1)(1)=2 m2(1)(1)=5
m1(1)(2)=3 m2(1)(2)=6
total
4
5
6
4
5
6

accumulator
1
2
3
1
2
3

accumulator
4
5
6
4
5
6

最终结果是一样的。

但我有两个问题:

  • 我不知道为什么两个输出顺序不一样。
  • 为什么要调用addInplace函数两次?
    • 我想我知道为什么这个函数会被调用两次,但我不确定
      • initialize:Array(Array(1,1,1),Array(1,1,1)
      • 从任务输出:Array(Array(1,2,3),Array(1,2,3)
      • 从另一个任务输出:Array(Array(4,5,6),Array(4,5,6)

@Vijay Innamuri