I have translated the below Spark Scala code to a Python version.
package wscalalearning00
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.log4j._
import org.apache.spark.util.StatCounter
object wtry001 {
def main(args: Array[String]) {
Logger.getLogger("org").setLevel(Level.ERROR)
class BballStatCounter extends Serializable {
val stats: StatCounter = new StatCounter()
var missing: Long = 0
def add(x: Double): BballStatCounter = {
if (x.isNaN) {
missing += 1
} else {
stats.merge(x)
}
this
}
}
object BballStatCounter extends Serializable {
def apply(x: Double) = new BballStatCounter().add(x)
}
// Create a SparkContext using every core of the local machine
val sc = new SparkContext(new SparkConf().setAppName("Spark Word Count").setMaster("local"))
val testData = (1 to 10000).toArray.map(x=>x.toDouble)
val stats1 = sc.parallelize(testData)
val stat3 = stats1.map(b=>BballStatCounter(b))
stat3.foreach(println)
}
}
Python version:
import math
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.statcounter import StatCounter
class BballStatCounter(object):
stats = StatCounter()
missing=0
@staticmethod
def add(x):
print("add")
if math.isnan(x):
BballStatCounter.missing += x
else:
BballStatCounter.stats.merge(x)
return BballStatCounter.stats
conf = SparkConf().setAppName("SparkExampleRDD").setMaster("local")
sc = SparkContext(conf=conf)
testData =map(lambda x: float(x),range(0,10000))
stats1 = sc.parallelize(testData)
print(stats1)
stat3 = stats1.map(lambda b: BballStatCounter.add(b))
stat3.foreach(print)
On executing the above Python code, it prints (count: 22, mean: 10.5, stdev: 6.34428877022, max: 21.0, min: 0.0)
where as the Scala code prints something like this BballStatCounter$2@7a811dd5
.I think that Scala code is returning instances of same class. Please let me know if my general approach and/or syntax are wrong. Thanks.
答案 0 :(得分:0)
尝试
P4EDITOR
然后做
class BballStatCounter:
def __init__(self):
self.stats = StatCounter()
self.missing = 0
@staticmethod
def add(x):
stats = StatCounter()
missing = 0
if math.isnan(x):
BballStatCounter.missing += x
else:
BballStatCounter.stats.merge(x,stats,missing)
def merge(other,stats,missing):
# stats = BballStatCounter.StatCounter()
#other = BballStatCounter()
BballStatCounter.stats.merge(other.stats)
BballStatCounter.missing = +other.missing
return self
我可能没有给您正确的代码,因为我对函数的功能了解甚少。只关注我如何在函数之间传递变量。- 希望你能找到答案。