I am trying to load a JSON file inside a Spark UDF and use it to query something. What I need to do is take a column value (storeId) from my dataframe and use it in that query. However, I get a pickle error. If I write the code without the sqlContext, it works. Is there any workaround, or is this simply not possible?
def get_id_udf(storeId, sqlContext):
    df = sqlContext.read.json("file_url_s3")
    if storeId is None:
        return None
    # ... lookup against df goes here ...
    return None

from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

desc_udf = udf(lambda storeId: get_id_udf(storeId, sqlContext), IntegerType())
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-22-b5c4070c110e> in <module>()
      1 from pyspark.sql.functions import udf, col
      2 from pyspark.sql.types import IntegerType
----> 3 desc_udf = udf(lambda storeId : get_cluster_id_udf(storeId,sqlContext), IntegerType())

/usr/lib/spark/python/pyspark/sql/functions.py in udf(f, returnType)
   1799     [Row(slen=5), Row(slen=3)]
   1800     """
-> 1801     return UserDefinedFunction(f, returnType)
   1802
   1803 blacklist = ['map', 'since', 'ignore_unicode_prefix']

/usr/lib/spark/python/pyspark/sql/functions.py in __init__(self, func, returnType, name)
   1758         self.returnType = returnType
   1759         self._broadcast = None
-> 1760         self._judf = self._create_judf(name)
   1761
   1762     def _create_judf(self, name):

/usr/lib/spark/python/pyspark/sql/functions.py in _create_judf(self, name)
   1763         from pyspark.sql import SQLContext
   1764         sc = SparkContext.getOrCreate()
-> 1765         wrapped_func = _wrap_function(sc, self.func, self.returnType)
   1766         ctx = SQLContext.getOrCreate(sc)
   1767         jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())

/usr/lib/spark/python/pyspark/sql/functions.py in _wrap_function(sc, func, returnType)
   1743 def _wrap_function(sc, func, returnType):
   1744     command = (func, returnType)
-> 1745     pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
   1746     return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
   1747                                   sc.pythonVer, broadcast_vars, sc._javaAccumulator)

/usr/lib/spark/python/pyspark/rdd.py in _prepare_for_python_RDD(sc, command)
   2313     # the serialized command will be compressed by broadcast
   2314     ser = CloudPickleSerializer()
-> 2315     pickled_command = ser.dumps(command)
   2316     if len(pickled_command) > (1 << 20):  # 1M
   2317         # The broadcast will have same life cycle as created PythonRDD

/usr/lib/spark/python/pyspark/serializers.py in dumps(self, obj)
    426
    427     def dumps(self, obj):
--> 428         return cloudpickle.dumps(obj, 2)
    429
    430

/usr/lib/spark/python/pyspark/cloudpickle.py in dumps(obj, protocol)
    655
    656     cp = CloudPickler(file,protocol)
--> 657     cp.dump(obj)
    658
    659     return file.getvalue()

/usr/lib/spark/python/pyspark/cloudpickle.py in dump(self, obj)
    105         self.inject_addons()
    106         try:
--> 107             return Pickler.dump(self, obj)
    108         except RuntimeError as e:
    109             if 'recursion' in e.args[0]:

/usr/lib64/python2.7/pickle.pyc in dump(self, obj)
    222         if self.proto >= 2:
    223             self.write(PROTO + chr(self.proto))
--> 224         self.save(obj)
    225         self.write(STOP)
    226

/usr/lib64/python2.7/pickle.pyc in save(self, obj)
    284         f = self.dispatch.get(t)
    285         if f:
--> 286             f(self, obj)  # Call unbound method with explicit self
    287             return
    288

/usr/lib64/python2.7/pickle.pyc in save_dict(self, obj)
    653
    654         self.memoize(obj)
--> 655         self._batch_setitems(obj.iteritems())
    656
    657     dispatch[DictionaryType] = save_dict

/usr/lib64/python2.7/pickle.pyc in _batch_setitems(self, items)
    685             for k, v in tmp:
    686                 save(k)
--> 687                 save(v)
    688             write(SETITEMS)
    689         elif n:

/usr/lib64/python2.7/pickle.pyc in save(self, obj)
    304         reduce = getattr(obj, "__reduce_ex__", None)
    305         if reduce:
--> 306             rv = reduce(self.proto)
    307         else:
    308             reduce = getattr(obj, "__reduce__", None)

TypeError: 'JavaPackage' object is not callable
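My understanding so far is that the lambda closes over sqlContext, and Spark then tries to pickle that JVM-backed object when shipping the UDF to the executors, which is what blows up. The workaround I am considering is to read the JSON once on the driver, collect the mapping into a plain Python dict, and broadcast that so the UDF only captures picklable data. A minimal sketch of that idea, assuming the JSON file has "storeId" and "id" fields (placeholder names, the real schema may differ):

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Read the JSON once on the driver and turn it into a plain Python dict.
lookup_df = sqlContext.read.json("file_url_s3")
lookup = {row["storeId"]: row["id"] for row in lookup_df.collect()}

# Broadcast the dict; a broadcast variable pickles fine, a SQLContext does not.
lookup_bc = sc.broadcast(lookup)

def get_id(storeId):
    if storeId is None:
        return None
    return lookup_bc.value.get(storeId)

desc_udf = udf(get_id, IntegerType())
# usage: df.withColumn("id", desc_udf(col("storeId")))

Would this be the recommended approach, or is there a way to keep the lookup as a dataframe query per row?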