我有一个数据框
# Each row is one [latitude, longitude] pair, stored in columns 'a' and 'b'.
data = sqlContext.createDataFrame(
    [
        [33.603699, -83.967819],
        [43.609422, -84.188726],
        [40.751800537, -74.066200256],
    ],
    ['a', 'b'],
)
我有一个纬度/经度对的列表。对于数据中的每个纬度/经度对,我想计算它与列表中每个纬度/经度对之间的距离。我使用了下面这个回答中的代码作为我的距离函数:
How to sum distances between data points in a dataset using (Py)Spark?
# Reference points, one [latitude, longitude] pair per entry.
lat_lon_list = [
    [26.145677, -80.120355],
    [26.179337, -80.25151600000001],
    [26.188919, -98.21469499999999],
    [26.641769, -81.875031],
]
def dist_2(long_x, lat_x, long_y, lat_y):
    """Build a Spark Column holding a great-circle distance in miles.

    Applies the spherical law of cosines between a point stored in two
    DataFrame columns and a fixed reference point.

    Args:
        long_x: Name of the column holding the longitude, in degrees.
        lat_x: Name of the column holding the latitude, in degrees.
        long_y: Longitude of the reference point, in degrees (plain number).
        lat_y: Latitude of the reference point, in degrees (plain number).

    Returns:
        A Column expression that evaluates to the distance in miles.
    """
    # The reference point is constant, so its trigonometric terms are computed
    # once on the driver. float() keeps them plain Python floats rather than
    # NumPy scalars when they are combined with Column objects.
    sin_lat_y = float(np.sin(np.radians(lat_y)))
    cos_lat_y = float(np.cos(np.radians(lat_y)))
    long_y_rad = float(np.radians(long_y))

    lat_x_rad = F.toRadians(F.col(lat_x))
    long_x_rad = F.toRadians(F.col(long_x))

    # Central angle:
    #   acos(sin(lat1) * sin(lat2) + cos(lat1) * cos(lat2) * cos(lon1 - lon2))
    central_angle = F.acos(
        F.sin(lat_x_rad) * sin_lat_y
        + F.cos(lat_x_rad) * cos_lat_y * F.cos(long_x_rad - long_y_rad)
    )

    # Scale the angle by Earth's mean radius (6371.0 km) converted to miles;
    # the factor must be applied outside acos, not inside it.
    return central_angle * F.lit(6371.0 * 0.621371)
def dist_1(x, y):
    """Return one distance Column per reference point in ``lat_lon_list``.

    Args:
        x: Name of the latitude column (called as ``dist_1('a', 'b')``,
            where column ``a`` holds the latitudes).
        y: Name of the longitude column.

    Returns:
        A list of Column expressions, one for each entry of ``lat_lon_list``.
        Because this is a list, unpack it (``*dist_1(...)``) or concatenate it
        with the other columns before handing it to ``DataFrame.select``.
    """
    # lat_lon_list entries are [lat, lon] while dist_2 takes longitude first,
    # so pass everything by keyword to avoid swapping the coordinates.
    return [
        dist_2(long_x=y, lat_x=x, long_y=lon, lat_y=lat)
        for lat, lon in lat_lon_list
    ]
当我尝试计算距离时,我得到以下错误
# dist_1 returns a Python list, so select() receives a list nested among its
# other arguments; this call raises the TypeError shown in the traceback below.
data.select('a','b',dist_1('a','b')).show()
TypeErrorTraceback (most recent call last)
<ipython-input-53-8ec09912a7b1> in <module>()
24
25
---> 26 data.select('a','b',dist_1('a','b')).show()
/opt/spark/current/python/pyspark/sql/dataframe.py in select(self,
*cols)
859 [Row(name=u'Alice', age=12), Row(name=u'Bob',
age=15)]
860 """
--> 861 jdf = self._jdf.select(self._jcols(*cols))
862 return DataFrame(jdf, self.sql_ctx)
863
/opt/spark/current/python/pyspark/sql/dataframe.py in _jcols(self,
*cols)
714 if len(cols) == 1 and isinstance(cols[0], list):
715 cols = cols[0]
--> 716 return self._jseq(cols, _to_java_column)
717
718 def _sort_cols(self, cols, kwargs):
/opt/spark/current/python/pyspark/sql/dataframe.py in _jseq(self,
cols, converter)
701 def _jseq(self, cols, converter=None):
702 """Return a JVM Seq of Columns from a list of Column
or names"""
--> 703 return _to_seq(self.sql_ctx._sc, cols, converter)
704
705 def _jmap(self, jm):
/opt/spark/current/python/pyspark/sql/column.py in _to_seq(sc, cols,
converter)
57 """
58 if converter:
---> 59 cols = [converter(c) for c in cols]
60 return sc._jvm.PythonUtils.toSeq(cols)
61
/opt/spark/current/python/pyspark/sql/column.py in
_to_java_column(col)
45 jcol = col._jc
46 else:
---> 47 jcol = _create_column_from_name(col)
48 return jcol
49
/opt/spark/current/python/pyspark/sql/column.py in
_create_column_from_name(name)
38 def _create_column_from_name(name):
39 sc = SparkContext._active_spark_context
---> 40 return sc._jvm.functions.col(name)
41
42
/opt/spark/current/python/lib/py4j-0.10.3-
src.zip/py4j/java_gateway.py in __call__(self, *args)
1122
1123 def __call__(self, *args):
-> 1124 args_command, temp_args = self._build_args(*args)
1125
1126 command = proto.CALL_COMMAND_NAME +\
/opt/spark/current/python/lib/py4j-0.10.3-
src.zip/py4j/java_gateway.py in _build_args(self, *args)
1086 def _build_args(self, *args):
1087 if self.converters is not None and
len(self.converters) > 0:
-> 1088 (new_args, temp_args) = self._get_args(args)
1089 else:
1090 new_args = args
/opt/spark/current/python/lib/py4j-0.10.3-
src.zip/py4j/java_gateway.py in _get_args(self, args)
1073 for converter in
self.gateway_client.converters:
1074 if converter.can_convert(arg):
-> 1075 temp_arg = converter.convert(arg,
self.gateway_client)
1076 temp_args.append(temp_arg)
1077 new_args.append(temp_arg)
/opt/spark/current/python/lib/py4j-0.10.3-
src.zip/py4j/java_collections.py in convert(self, object,
gateway_client)
499 java_list = ArrayList()
500 for element in object:
--> 501 java_list.add(element)
502 return java_list
503
/opt/spark/current/python/lib/py4j-0.10.3-
src.zip/py4j/java_gateway.py in __call__(self, *args)
1122
1123 def __call__(self, *args):
-> 1124 args_command, temp_args = self._build_args(*args)
1125
1126 command = proto.CALL_COMMAND_NAME +\
/opt/spark/current/python/lib/py4j-0.10.3-
src.zip/py4j/java_gateway.py in _build_args(self, *args)
1086 def _build_args(self, *args):
1087 if self.converters is not None and
len(self.converters) > 0:
-> 1088 (new_args, temp_args) = self._get_args(args)
1089 else:
1090 new_args = args
/opt/spark/current/python/lib/py4j-0.10.3-
src.zip/py4j/java_gateway.py in _get_args(self, args)
1073 for converter in
self.gateway_client.converters:
1074 if converter.can_convert(arg):
-> 1075 temp_arg = converter.convert(arg,
self.gateway_client)
1076 temp_args.append(temp_arg)
1077 new_args.append(temp_arg)
/opt/spark/current/python/lib/py4j-0.10.3-
src.zip/py4j/java_collections.py in convert(self, object,
gateway_client)
510 HashMap = JavaClass("java.util.HashMap",
gateway_client)
511 java_map = HashMap()
--> 512 for key in object.keys():
513 java_map[key] = object[key]
514 return java_map
TypeError: 'Column' object is not callable
任何帮助都将不胜感激。
答案 0 :(得分:0)
这是因为你的函数返回的是一个列表。你可以将其解包:
# Unpack the list so each distance Column is passed as its own argument.
data.select('a','b', *dist_1('a','b'))
或合并:
# Alternatively, pass select() a single flat list of column names and Columns.
data.select(['a','b'] + dist_1('a','b'))