I have a number of 2-D numpy arrays in a parallelized RDD, and I call a map function that operates on each numpy array and returns a 2-D numpy array. But when I call collect, I get an AssertionError. Does anyone know what is going on?
import numpy as np
from stencil_code.stencil_kernel import Stencil      # stencil_code (SEJITS) imports --
from stencil_code.neighborhood import Neighborhood   # paths assumed from that package's examples

class JacobiNd(Stencil):
    def __init__(self, dimensions, radius, use_moore=False, backend='c', boundary_handling='clamp', **kwargs):
        if use_moore:
            neighborhoods = [Neighborhood.moore_neighborhood(radius=radius, dim=dimensions, include_origin=False)]
        else:
            neighborhoods = [Neighborhood.von_neuman_neighborhood(radius=radius, dim=dimensions, include_origin=False)]
        super(JacobiNd, self).__init__(backend=backend, neighborhoods=neighborhoods, boundary_handling=boundary_handling, **kwargs)
        self.neighbor_weight = 1.0 / len(neighborhoods[0])

    def kernel(self, in_grid, out_grid):
        # each interior point becomes the average of its neighbors (Jacobi relaxation)
        for x in self.interior_points(out_grid):
            out_grid[x] = 0.0
            for y in self.neighbors(x, 0):
                out_grid[x] += self.neighbor_weight * in_grid[y]
stencil = JacobiNd(dimensions=2, radius=1).kernel

def jacobi_spark(twoD_data):
    out = np.zeros_like(twoD_data)
    stencil(twoD_data, out)
    return out

# sc is the SparkContext; partitioned_twoD_data is the list of 2-D numpy arrays
twoD_jacobi_rdd = sc.parallelize(partitioned_twoD_data, twoD_partitions)
test_run = twoD_jacobi_rdd.map(lambda x: jacobi_spark(x))
print test_run.collect()
If I run the stencil on its own against a 2-D array passed in:
out = stencil(twoD_array_data)
I get a new 2-D array back.
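(For reference, a minimal sketch of that standalone check, using the two-argument kernel signature from jacobi_spark above; the 8x8 random grid is just a hypothetical test input:)

import numpy as np

stencil = JacobiNd(dimensions=2, radius=1).kernel
in_grid = np.random.rand(8, 8)    # hypothetical small test grid
out_grid = np.zeros_like(in_grid)
stencil(in_grid, out_grid)        # fills out_grid with the weighted neighbor averages
print out_grid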
The AssertionError:
Traceback (most recent call last):
File "File.py", line 90, in <module>
print test_run.collect()
File "/home/S/spark-1.4.0/python/pyspark/rdd.py", line 188, in __repr__
return self._jrdd.toString()
File "/home/S/spark-1.4.0/python/pyspark/rdd.py", line 2351, in _jrdd
pickled_cmd, bvars, env, includes = _prepare_for_python_RDD(self.ctx, command, self)
File "/home/S/spark-1.4.0/python/pyspark/rdd.py", line 2271, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "/home/S/spark-1.4.0/python/pyspark/serializers.py", line 427, in dumps
return cloudpickle.dumps(obj, 2)
File "/home/S/spark-1.4.0/python/pyspark/cloudpickle.py", line 622, in dumps
cp.dump(obj)
File "/home/S/spark-1.4.0/python/pyspark/cloudpickle.py", line 107, in dump
return Pickler.dump(self, obj)
File "/usr/lib/python2.7/pickle.py", line 224, in dump
self.save(obj)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 562, in save_tuple
save(element)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/S/spark-1.4.0/python/pyspark/cloudpickle.py", line 199, in save_function
self.save_function_tuple(obj)
File "/home/S/spark-1.4.0/python/pyspark/cloudpickle.py", line 236, in save_function_tuple
save((code, closure, base_globals))
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 548, in save_tuple
save(element)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 600, in save_list
self._batch_appends(iter(obj))
File "/usr/lib/python2.7/pickle.py", line 636, in _batch_appends
save(tmp[0])
File "/usr/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/home/S/spark-1.4.0/python/pyspark/cloudpickle.py", line 485, in save_reduce
save(cls)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/S/spark-1.4.0/python/pyspark/cloudpickle.py", line 353, in save_global
self.save_reduce(typ, (obj.__name__, obj.__bases__, d), obj=obj)
File "/home/S/spark-1.4.0/python/pyspark/cloudpickle.py", line 504, in save_reduce
self.memoize(obj)
File "/usr/lib/python2.7/pickle.py", line 244, in memoize
assert id(obj) not in self.memo
AssertionError
Solution: I just needed to put the JacobiNd class into its own file. I am not entirely sure why, but it works. Can anyone explain the reason?
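A plausible reason, and a minimal sketch of the fix: pickle serializes an importable class by reference (just its module and name), but a class defined in __main__ has to be serialized by value, and the cloudpickle bundled with Spark 1.4 can trip the `assert id(obj) not in self.memo` check on that by-value path, which is exactly the frame at the bottom of the traceback above. Moving JacobiNd into its own module switches it to the by-reference path. The file name below is hypothetical; sc, partitioned_twoD_data, and twoD_partitions are from the question:

# jacobi_nd.py (hypothetical name) would hold the imports and the JacobiNd
# class exactly as written above, and nothing else.

# driver script -- import the class instead of defining it in __main__
import numpy as np
from jacobi_nd import JacobiNd   # now pickled by reference, not by value

stencil = JacobiNd(dimensions=2, radius=1).kernel

def jacobi_spark(twoD_data):
    out = np.zeros_like(twoD_data)
    stencil(twoD_data, out)
    return out

# ship the module so the import also succeeds on the workers
sc.addPyFile('jacobi_nd.py')

twoD_jacobi_rdd = sc.parallelize(partitioned_twoD_data, twoD_partitions)
print twoD_jacobi_rdd.map(jacobi_spark).collect()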