How to use sqlContext inside a Spark UDF

Date: 2017-10-18 20:11:17

Tags: pyspark, user-defined-functions

I am trying to load a JSON file inside a Spark UDF and use it to query something. What I need to do is take a column value (storeId) from the dataframe and use it in that query.

However, I get a pickling error. If I write the code without the sqlContext, it works.

Is there any workaround, or is this simply not possible?

def get_id_udf(storeId, sqlContext):
    # sqlContext is captured by the UDF closure, so Spark tries to pickle it
    df = sqlContext.read.json("file_url_s3")
    if storeId is None:
        return None
    return None

from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

desc_udf = udf(lambda storeId: get_id_udf(storeId, sqlContext), IntegerType())
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-22-b5c4070c110e> in <module>()
      1 from pyspark.sql.functions import udf, col
      2 from pyspark.sql.types import IntegerType
----> 3 desc_udf = udf(lambda storeId : get_cluster_id_udf(storeId,sqlContext), IntegerType())

/usr/lib/spark/python/pyspark/sql/functions.py in udf(f, returnType)
   1799     [Row(slen=5), Row(slen=3)]
   1800     """
-> 1801     return UserDefinedFunction(f, returnType)
   1802 
   1803 blacklist = ['map', 'since', 'ignore_unicode_prefix']

/usr/lib/spark/python/pyspark/sql/functions.py in __init__(self, func, returnType, name)
   1758         self.returnType = returnType
   1759         self._broadcast = None
-> 1760         self._judf = self._create_judf(name)
   1761 
   1762     def _create_judf(self, name):

/usr/lib/spark/python/pyspark/sql/functions.py in _create_judf(self, name)
   1763         from pyspark.sql import SQLContext
   1764         sc = SparkContext.getOrCreate()
-> 1765         wrapped_func = _wrap_function(sc, self.func, self.returnType)
   1766         ctx = SQLContext.getOrCreate(sc)
   1767         jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())

/usr/lib/spark/python/pyspark/sql/functions.py in _wrap_function(sc, func, returnType)
   1743 def _wrap_function(sc, func, returnType):
   1744     command = (func, returnType)
-> 1745     pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
   1746     return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
   1747                                   sc.pythonVer, broadcast_vars, sc._javaAccumulator)

/usr/lib/spark/python/pyspark/rdd.py in _prepare_for_python_RDD(sc, command)
   2313     # the serialized command will be compressed by broadcast
   2314     ser = CloudPickleSerializer()
-> 2315     pickled_command = ser.dumps(command)
   2316     if len(pickled_command) > (1 << 20):  # 1M
   2317         # The broadcast will have same life cycle as created PythonRDD

/usr/lib/spark/python/pyspark/serializers.py in dumps(self, obj)
    426 
    427     def dumps(self, obj):
--> 428         return cloudpickle.dumps(obj, 2)
    429 
    430 

/usr/lib/spark/python/pyspark/cloudpickle.py in dumps(obj, protocol)
    655 
    656     cp = CloudPickler(file,protocol)
--> 657     cp.dump(obj)
    658 
    659     return file.getvalue()

/usr/lib/spark/python/pyspark/cloudpickle.py in dump(self, obj)
    105         self.inject_addons()
    106         try:
--> 107             return Pickler.dump(self, obj)
    108         except RuntimeError as e:
    109             if 'recursion' in e.args[0]:

/usr/lib64/python2.7/pickle.pyc in dump(self, obj)
    222         if self.proto >= 2:
    223             self.write(PROTO + chr(self.proto))
--> 224         self.save(obj)
    225         self.write(STOP)
    226 


/usr/lib64/python2.7/pickle.pyc in save(self, obj)
    284         f = self.dispatch.get(t)
    285         if f:
--> 286             f(self, obj) # Call unbound method with explicit self
    287             return
    288 

/usr/lib64/python2.7/pickle.pyc in save_dict(self, obj)
    653 
    654         self.memoize(obj)
--> 655         self._batch_setitems(obj.iteritems())
    656 
    657     dispatch[DictionaryType] = save_dict

/usr/lib64/python2.7/pickle.pyc in _batch_setitems(self, items)
    685                 for k, v in tmp:
    686                     save(k)
--> 687                     save(v)
    688                 write(SETITEMS)
    689             elif n:

/usr/lib64/python2.7/pickle.pyc in save(self, obj)
    304             reduce = getattr(obj, "__reduce_ex__", None)
    305             if reduce:
--> 306                 rv = reduce(self.proto)
    307             else:
    308                 reduce = getattr(obj, "__reduce__", None)

TypeError: 'JavaPackage' object is not callable
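
One workaround worth noting (not part of the original post): the sqlContext only exists on the driver, so it cannot be captured by a UDF closure and serialized to executors, which is why the pickling fails. A common alternative is to read the JSON once on the driver and join it with the main dataframe, or to broadcast a small lookup dict and use that inside a plain UDF. A minimal sketch, assuming a hypothetical stores_df dataframe and that the JSON contains storeId and id columns:

from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# Sketch of a workaround (not from the original post): read the JSON once on
# the driver and join, so no sqlContext is needed inside a UDF.
# stores_df and the column names storeId/id are assumed placeholders.
lookup_df = sqlContext.read.json("file_url_s3")
result = stores_df.join(lookup_df, on="storeId", how="left")

# Alternative, if the JSON is small: collect it into a dict, broadcast it,
# and look values up in a plain UDF that only closes over the broadcast.
lookup = {row["storeId"]: row["id"] for row in lookup_df.collect()}
b_lookup = sc.broadcast(lookup)
get_id = udf(lambda storeId: b_lookup.value.get(storeId), IntegerType())
df_with_id = stores_df.withColumn("id", get_id(col("storeId")))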

0 Answers:

No answers yet.