Computing distances between DataFrame columns and a Python list of coordinates

Date: 2017-07-31 20:33:02

Tags: apache-spark pyspark

I have a DataFrame:

data = sqlContext.createDataFrame([[33.603699, -83.967819], [43.609422, -84.188726], [40.751800537, -74.066200256]], ['a', 'b'])

I also have a list of lat/lon pairs. For each lat/lon pair in the DataFrame, I want to compute the distance to every pair in that list. I am using the code from this answer as my distance function:

How to sum distances between data points in a dataset using (Py)Spark?

lat_lon_list=[[26.145677, -80.120355],[26.179337, -80.25151600000001],[26.188919, -98.21469499999999], [26.641769, -81.875031]]


import numpy as np
from pyspark.sql import functions as F

def dist_2(long_x, lat_x, long_y, lat_y):
    # (long_x, lat_x) are DataFrame column names; (long_y, lat_y) is a fixed point.
    z0 = np.sin(np.radians(lat_y))
    z1 = np.cos(np.radians(lat_y))
    z3 = np.radians(long_y)

    # Spherical law of cosines, with the Earth radius converted from km to miles.
    return F.acos(
        F.sin(F.toRadians(F.col(lat_x))) * z0 +
        F.cos(F.toRadians(F.col(lat_x))) * z1 *
        F.cos(F.toRadians(F.col(long_x)) - F.lit(z3))
    ) * F.lit(6371.0 * 0.621371)

def dist_1(x, y):
    # Build one distance expression per reference point; returns a Python list of Columns.
    return [dist_2(x, y, c[0], c[1]) for c in lat_lon_list]
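
Nothing is evaluated at this point: dist_2 only builds a Column expression, so dist_1 returns a plain Python list of Column objects. A quick check, assuming the definitions above:

exprs = dist_1('a', 'b')
print(len(exprs))      # 4, one expression per pair in lat_lon_list
print(type(exprs[0]))  # <class 'pyspark.sql.column.Column'>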

When I try to compute the distances, I get the following error:

data.select('a','b',dist_1('a','b')).show()



TypeErrorTraceback (most recent call last)
<ipython-input-53-8ec09912a7b1> in <module>()
     24 
     25 
---> 26 data.select('a','b',dist_1('a','b')).show()

/opt/spark/current/python/pyspark/sql/dataframe.py in select(self, *cols)
    859         [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]
    860         """
--> 861         jdf = self._jdf.select(self._jcols(*cols))
    862         return DataFrame(jdf, self.sql_ctx)
    863 

/opt/spark/current/python/pyspark/sql/dataframe.py in _jcols(self, *cols)
    714         if len(cols) == 1 and isinstance(cols[0], list):
    715             cols = cols[0]
--> 716         return self._jseq(cols, _to_java_column)
    717 
    718     def _sort_cols(self, cols, kwargs):

/opt/spark/current/python/pyspark/sql/dataframe.py in _jseq(self, cols, converter)
    701     def _jseq(self, cols, converter=None):
    702         """Return a JVM Seq of Columns from a list of Column or names"""
--> 703         return _to_seq(self.sql_ctx._sc, cols, converter)
    704 
    705     def _jmap(self, jm):

/opt/spark/current/python/pyspark/sql/column.py in _to_seq(sc, cols, converter)
     57     """
     58     if converter:
---> 59         cols = [converter(c) for c in cols]
     60     return sc._jvm.PythonUtils.toSeq(cols)
     61 

/opt/spark/current/python/pyspark/sql/column.py in _to_java_column(col)
     45         jcol = col._jc
     46     else:
---> 47         jcol = _create_column_from_name(col)
     48     return jcol
     49 

/opt/spark/current/python/pyspark/sql/column.py in _create_column_from_name(name)
     38 def _create_column_from_name(name):
     39     sc = SparkContext._active_spark_context
---> 40     return sc._jvm.functions.col(name)
     41 
     42 

/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1122 
   1123     def __call__(self, *args):
-> 1124         args_command, temp_args = self._build_args(*args)
   1125 
   1126         command = proto.CALL_COMMAND_NAME +\

/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _build_args(self, *args)
   1086     def _build_args(self, *args):
   1087         if self.converters is not None and len(self.converters) > 0:
-> 1088             (new_args, temp_args) = self._get_args(args)
   1089         else:
   1090             new_args = args

/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _get_args(self, args)
   1073                 for converter in self.gateway_client.converters:
   1074                     if converter.can_convert(arg):
-> 1075                         temp_arg = converter.convert(arg, self.gateway_client)
   1076                         temp_args.append(temp_arg)
   1077                         new_args.append(temp_arg)

/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_collections.py in convert(self, object, gateway_client)
    499         java_list = ArrayList()
    500         for element in object:
--> 501             java_list.add(element)
    502         return java_list
    503 

/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1122 
   1123     def __call__(self, *args):
-> 1124         args_command, temp_args = self._build_args(*args)
   1125 
   1126         command = proto.CALL_COMMAND_NAME +\

/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _build_args(self, *args)
   1086     def _build_args(self, *args):
   1087         if self.converters is not None and len(self.converters) > 0:
-> 1088             (new_args, temp_args) = self._get_args(args)
   1089         else:
   1090             new_args = args

/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in _get_args(self, args)
   1073                 for converter in self.gateway_client.converters:
   1074                     if converter.can_convert(arg):
-> 1075                         temp_arg = converter.convert(arg, self.gateway_client)
   1076                         temp_args.append(temp_arg)
   1077                         new_args.append(temp_arg)

/opt/spark/current/python/lib/py4j-0.10.3-src.zip/py4j/java_collections.py in convert(self, object, gateway_client)
    510         HashMap = JavaClass("java.util.HashMap", gateway_client)
    511         java_map = HashMap()
--> 512         for key in object.keys():
    513             java_map[key] = object[key]
    514         return java_map

TypeError: 'Column' object is not callable

Any help would be appreciated.

1 answer:

Answer 0 (score: 0):

This is because your function returns a Python list of Column objects, which select cannot accept as a single argument alongside column names. You can unpack it:

data.select('a','b', *dist_1('a','b'))

or concatenate the lists:

data.select(['a','b'] + dist_1('a','b'))
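
Either way, each element of dist_1('a','b') becomes its own output column. If you want readable column names, one option (a minimal sketch; the dist_i aliases are just an illustration) is to alias each expression before selecting:

dist_cols = [d.alias('dist_{}'.format(i)) for i, d in enumerate(dist_1('a', 'b'))]
data.select('a', 'b', *dist_cols).show()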