我有一个 PySpark DataFrame,想在执行 map 操作之后把它转换成 Pandas DataFrame。如果先用 take 操作取出若干行,再逐行调用映射函数,是可以正常工作的:
a = <scikit-learn object>
b = <scikit-learn object>
my_sdf_list = []
for x in my_sdf.take(20):
my_sdf_list.append(my_map_func(x, a, b))
但如果我尝试直接 collect,或者把 map 之后的结果转换回 DataFrame:
方法1
my_sdf_list = my_sdf.map(lambda x : my_map_func(x, a, b)).collect()
方法2
pandas_df = my_sdf.map(lambda x : my_map_func(x, a, b)).toDF(my_new_schema).toPandas()
然后我收到以下错误:
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 757, in collect
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2363, in _jrdd
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2283, in _prepare_for_python_RDD
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 427, in dumps
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 622, in dumps
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 107, in dump
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 224, in dump
self.save(obj)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 562, in save_tuple
save(element)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 199, in save_function
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 236, in save_function_tuple
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 548, in save_tuple
save(element)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 600, in save_list
self._batch_appends(iter(obj))
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 633, in _batch_appends
save(x)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 199, in save_function
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 236, in save_function_tuple
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 548, in save_tuple
save(element)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 600, in save_list
self._batch_appends(iter(obj))
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 636, in _batch_appends
save(tmp[0])
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 193, in save_function
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 241, in save_function_tuple
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 518, in save_reduce
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 600, in save_list
self._batch_appends(iter(obj))
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 633, in _batch_appends
save(x)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 518, in save_reduce
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 518, in save_reduce
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/usr/hdp/2.3.2.0-2950/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 518, in save_reduce
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 562, in save_tuple
save(element)
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/apps/python2.7/anaconda/lib/python2.7/pickle.py", line 486, in save_string
self.write(BINSTRING + pack("<i", n) + obj)
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
最终结果行的 schema 很简单(string、string、string、string、string、double),所以我认为这个 pickle 错误与实际的 schema 无关。是不是把 scikit-learn 对象传递给数据节点这一步出了问题?