我的数据框 (df) 包含大约 850 万行,每行是一个点的纬度和经度。另外,我还有一个 nodes_list,其中包含另外 3000 个点的经度和纬度。我试图为 dataframe (df) 中的每个点,从 nodes_list 中找出距离最近的点:
我运行以下函数并得到了所需的结果,但问题是,当我尝试保存生成的 df 时,该任务花费的时间要长得多。这正常吗?代码如下:
def distance_GCS(lon1, lat1, lon2, lat2):
    """Return the great-circle distance between two points, in metres.

    The distance is computed with the haversine formula on a sphere of
    radius 6371.0088 km.

    Args:
        lon1: Longitude of the first point, in degrees.
        lat1: Latitude of the first point, in degrees.
        lon2: Longitude of the second point, in degrees.
        lat2: Latitude of the second point, in degrees.

    Returns:
        The distance along the sphere's surface, in metres.
    """
    # Kept as a function-scope import so the block does not depend on the
    # file's import section.
    from math import sin, cos, sqrt, atan2, radians

    R = 6371.0088  # sphere radius in km; converted to metres on return

    lon1 = radians(lon1)
    lat1 = radians(lat1)
    lon2 = radians(lon2)
    lat2 = radians(lat2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can leave `a` marginally outside [0, 1]
    # (e.g. for antipodal points), which would make sqrt() raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c * 1000  # km -> m
def nearest_node(lon1, lat1):  # nodes_list ['ID','Lon', 'lat']
    """Return the entry of ``nodes_list`` closest to the given point.

    Each element of the module-level ``nodes_list`` is indexed as
    ``node[0]`` (ID), ``node[1]`` (longitude) and ``node[2]`` (latitude).

    Args:
        lon1: Longitude of the query point, passed through to distance_GCS.
        lat1: Latitude of the query point, passed through to distance_GCS.

    Returns:
        A two-element list ``[node_id, distance]`` for the node with the
        smallest distance_GCS value; the first such node wins on ties.

    Raises:
        ValueError: If ``nodes_list`` is empty (raised by ``min``).
    """
    # A generator avoids materialising a throwaway list of every candidate
    # on each call; min() still scans the nodes in their original order.
    candidates = (
        [node[0], distance_GCS(lon1, lat1, node[1], node[2])]
        for node in nodes_list
    )
    return min(candidates, key=lambda pair: pair[1])
# Spark UDFs wrapping nearest_node (assumes `fn` / `typ` are
# pyspark.sql.functions / pyspark.sql.types -- TODO confirm against imports).
# Each UDF performs its own nearest_node search for every row.
# Fixed: the ID lambda previously called the undefined name `nearest_nodel`.
nearest_udf_ID = fn.udf(lambda x, y: nearest_node(x, y)[0], typ.StringType())
nearest_udf_distance = fn.udf(lambda x, y: nearest_node(x, y)[1], typ.FloatType())

# Add the nearest node's ID and its distance as two new columns.
df_GCSDisatnce_neraest_node = (
    df.withColumn('nearest_ID', nearest_udf_ID(fn.col('lon'), fn.col('lat')))
    .withColumn(
        'distance_to_nearest_node',
        nearest_udf_distance(fn.col('lon'), fn.col('lat')),
    )
)

# NOTE(review): Spark transformations are lazy, so the UDF work above is
# presumably executed only when this write runs. The coalesce() docs also
# warn that a drastic coalesce (e.g. to 1 partition) can make the upstream
# computation run on fewer tasks; repartition(1) is the documented
# alternative if a single output file is required -- confirm before changing.
df_GCSDisatnce_neraest_node.coalesce(1).write.parquet(r'D:\*****')