在spark文档中提供的示例代码中,我们有以下内容:
>>> from pyspark.accumulators import AccumulatorParam
>>> class VectorAccumulatorParam(AccumulatorParam):
... def zero(self, value):
... return [0.0] * len(value)
... def addInPlace(self, val1, val2):
... for i in xrange(len(val1)):
... val1[i] += val2[i]
... return val1
>>> va = sc.accumulator([1.0, 2.0, 3.0], VectorAccumulatorParam())
>>> va.value
[1.0, 2.0, 3.0]
>>> def g(x):
... global va
... va += [x] * 3
>>> rdd.foreach(g)
>>> va.value
[7.0, 8.0, 9.0]
但是,如果我要创建一个流并将流中的输入添加到累加器,我该怎么做呢?
您似乎只能向累加器添加列表或数组,但不能添加 DStream。
答案 0(得分:1):
只需在 foreachRDD 或 transform 中调用 va.add(),例如:
# Driver setup: one SparkContext, and a StreamingContext with a 3-second batch interval.
sc = SparkContext()
ssc = StreamingContext(sc, 3)
class VectorAccumulatorParam(AccumulatorParam):
    """AccumulatorParam that accumulates vectors (lists of numbers) element-wise."""

    def zero(self, value):
        # Identity element: a zero vector with the same length as the initial value.
        return [0] * len(value)

    def addInPlace(self, val1, val2):
        # Element-wise in-place addition; val2 must be a same-length sequence.
        # `range` replaces the Python-2-only `xrange`: the surrounding script
        # already uses Python-3 syntax (print() as a function inside a lambda).
        for i in range(len(val1)):
            val1[i] += val2[i]
        return val1
# Vector accumulator of length 3, starting at [0, 0, 0].
va = sc.accumulator([0] * 3, VectorAccumulatorParam())
data = range(30)
# zip(*[iter(data)] * 3) groups `data` into consecutive triples; each triple
# becomes a one-element RDD queued into the input stream.
# NOTE(review): on Python 3 `map` returns an iterator while queueStream may
# expect a list of RDDs -- confirm against the pyspark version in use.
lines = ssc.queueStream(map(lambda x: sc.parallelize([x]), zip(*[iter(data)] * 3)))
# Alternative ordering: add inside transform, print from foreachRDD.
# lines.transform(lambda rdd: rdd.foreach(lambda x: va.add(x)) or rdd) \
# .foreachRDD(lambda x: print("Now Accumulator Value is ({0})".format(va.value)))
# Each batch: print the accumulator's current value on the driver (transform
# runs its function on the driver), then add every element of the batch's RDD
# into the accumulator from the workers via foreachRDD.
lines.transform(lambda rdd: print("Now Accumulator Value is ({0})".format(va.value)) or rdd) \
.foreachRDD(lambda rdd: rdd.foreach(lambda x: va.add(x)))
lines.pprint()
ssc.start()
ssc.awaitTermination()