Question

I have translated the Spark Scala code below into Python.

package wscalalearning00
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.log4j._
import org.apache.spark.util.StatCounter

/** Small Spark driver: wraps every element of an RDD of doubles in a
  * `BballStatCounter` and prints the resulting wrappers.
  */
object wtry001 {

  /** Program entry point.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {

    // Silence everything below ERROR for loggers under "org".
    Logger.getLogger("org").setLevel(Level.ERROR)

    /** Accumulates numeric statistics while separately counting NaN values.
      *
      * No `toString` is defined here, so `println` on an instance shows the
      * default `ClassName@hash` form.
      */
    class BballStatCounter extends Serializable {
      val stats: StatCounter = new StatCounter()
      var missing: Long = 0

      /** Folds `x` into this counter: NaN bumps `missing`, anything else is
        * merged into `stats`.
        *
        * @param x value to record
        * @return this counter, so calls can be chained
        */
      def add(x: Double): BballStatCounter = {
        if (x.isNaN) {
          missing += 1
        } else {
          stats.merge(x)
        }
        this
      }
    }

    /** Companion factory: `BballStatCounter(x)` builds a fresh counter holding only `x`. */
    object BballStatCounter extends Serializable {
      def apply(x: Double): BballStatCounter = new BballStatCounter().add(x)
    }

    // Master "local" runs Spark in-process with a single worker thread
    // (use "local[*]" to get one thread per core).
    val sc = new SparkContext(new SparkConf().setAppName("Spark Word Count").setMaster("local"))
    try {
      val testData = (1 to 10000).map(_.toDouble)
      val stats1 = sc.parallelize(testData)
      // One independent counter per element; nothing is aggregated across elements.
      val stat3 = stats1.map(b => BballStatCounter(b))
      stat3.foreach(println)
    } finally {
      // Release the context even if the job above fails.
      sc.stop()
    }
  }
}

Python version:

 import math
    import findspark
    findspark.init()
    from pyspark.sql import SparkSession
    from pyspark.statcounter import StatCounter
    class BballStatCounter(object):
        """Running statistics plus a count of NaN ("missing") values.

        ``stats`` and ``missing`` are class attributes, so every call to
        ``add`` within one Python process accumulates into the same shared
        state; no per-value instance is ever created.
        """

        # Shared accumulator for every non-NaN value passed to add().
        stats = StatCounter()
        # Shared count of NaN values passed to add().
        missing = 0

        @staticmethod
        def add(x):
            """Record ``x`` in the shared state.

            Args:
                x: Float to record. NaN increments ``missing``; any other
                    value is merged into ``stats``.

            Returns:
                The shared ``StatCounter`` (not a ``BballStatCounter``).
            """
            print("add")  # debug trace emitted on every call
            if math.isnan(x):
                # Count the NaN; adding x itself would turn the counter into NaN.
                BballStatCounter.missing += 1
            else:
                BballStatCounter.stats.merge(x)
            return BballStatCounter.stats

    # SparkConf and SparkContext are used below but the file's import block
    # only brings in SparkSession and StatCounter, so import them here.
    from pyspark import SparkConf, SparkContext

    conf = SparkConf().setAppName("SparkExampleRDD").setMaster("local")
    sc = SparkContext(conf=conf)
    try:
        # Build a concrete list; a bare map() is a lazy iterator on Python 3.
        testData = [float(x) for x in range(0, 10000)]
        stats1 = sc.parallelize(testData)
        print(stats1)
        # Each element is replaced by whatever BballStatCounter.add returns for it.
        stat3 = stats1.map(lambda b: BballStatCounter.add(b))
        stat3.foreach(print)
    finally:
        # Release the context even if the job above fails.
        sc.stop()

On executing the above Python code, it prints `(count: 22, mean: 10.5, stdev: 6.34428877022, max: 21.0, min: 0.0)`, whereas the Scala code prints something like `BballStatCounter$2@7a811dd5`. I think the Scala code is returning instances of the same class. Please let me know if my general approach and/or syntax are wrong. Thanks.

Answer 1

Try

P4EDITOR

and then do

class BballStatCounter:
    """A StatCounter paired with a count of NaN ("missing") values.

    All state lives on the instance, so separate counters never share data.
    ``add`` is the factory for a counter holding one value; ``merge``
    combines two counters.
    """

    def __init__(self, stats=None, missing=0):
        """Create a counter.

        Args:
            stats: Existing StatCounter to wrap. A fresh, empty StatCounter
                is created when omitted.
            missing: Number of NaN values already seen (defaults to 0).
        """
        self.stats = StatCounter() if stats is None else stats
        self.missing = missing

    def _record(self, x):
        """Fold one value into this counter and return ``self``."""
        if math.isnan(x):
            # Count the NaN; adding x itself would turn the counter into NaN.
            self.missing += 1
        else:
            self.stats.merge(x)
        return self

    @staticmethod
    def add(x):
        """Build a new counter that has recorded only ``x``.

        Args:
            x: Float to record. NaN is counted as missing; any other value
                is merged into the new counter's ``stats``.

        Returns:
            The newly created ``BballStatCounter``.
        """
        return BballStatCounter()._record(x)

    def merge(self, other):
        """Fold another counter's state into this one.

        Args:
            other: The ``BballStatCounter`` whose ``stats`` and ``missing``
                are added to this counter. It is not modified by this method.

        Returns:
            ``self``, so merges can be chained or used in a reduce.
        """
        # mergeStats combines two StatCounters; merge() accepts a single number.
        self.stats.mergeStats(other.stats)
        self.missing += other.missing
        return self

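I may not have given you correct code, because I know very little about what these functions are supposed to do. Just focus on how I pass variables between the functions. - Hope you find the answer.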

Translating the companion object in scala to Python

1 answer: