# Register both DataFrames as temp tables so the SQL text can refer to them
# by the names 'coordinates' and 'metro_table'.
# NOTE(review): registerTempTable is documented to return None, so the names
# `coordinates` and `metro_table` presumably hold None, not a table handle — confirm.
coordinates = df0.registerTempTable('coordinates') #5555 rows
metro_table = df1.registerTempTable('metro_table') #272 rows
我想为每个坐标点找到距离最近的地铁站,为此我写了下面这个 SQL 查询,但我不明白为什么 Spark 不接受 INNER JOIN。我的第二个问题是:如何只返回每个点的第一行(即最近的车站)?我尝试使用 FETCH FIRST 1 ROWS ONLY,但它报了一个缩进错误。
# Builds the SQL text that joins `coordinates` against a sub-select of
# `metro_table` and computes a distance column for every resulting pair.
# NOTE(review): as written this statement is not valid Python. On the
# "fetch first 1 rows only" line the backslash is followed by a space and a
# "# ..." remark, so it no longer continues the line and the string literal is
# left unterminated. That remark is also inside the string, not a real comment.
# NOTE(review): the distance expression multiplies 6371000 by
# DEGREES(ACOS(...)), i.e. by an angle in degrees. An arc length is the radius
# times the angle in radians, so the values look ~57.3x too large — confirm
# (the sample rows show ~237 km for points that are roughly 4 km apart).
query = "SELECT uuid,\
latitude,\
longitude,\
p.station_id,\
p.xlat,\
p.xlong,\
p.type_train,\
p.id_transport,\
6371000* DEGREES(ACOS(COS(RADIANS(p.xlat))\
* COS(RADIANS(latitude))\
* COS(RADIANS(p.xlong) - RADIANS(longitude))\
+ SIN(RADIANS(p.xlat))\
* SIN(RADIANS(latitude)))) AS distance_in_meters\
FROM coordinates\
CROSS JOIN (\
SELECT id AS id_transport,\
station_id,\
xlat,\
xlong,\
type_train\
FROM metro_table\
fetch first 1 rows only\ # Doesn't work in Spark
) AS p ON 1=1\
ORDER BY distance_in_meters"
# Run the query; the result holds one row per (coordinate, station) pair.
df = sqlContext.sql(query) #1510960 rows
使用Pyspark(仅提取前1行)
# One window partition per coordinate point, ordered by ascending distance.
w = Window.partitionBy(['uuid', 'latitude', 'longitude']).orderBy('distance_in_meters')
# NOTE(review): F.min(...).over(w) attaches a windowed minimum to each row; it
# does not drop rows, which matches the count below being identical to the
# cross join's row count (1510960).
# NOTE(review): xlat and xlong are passed as bare names rather than strings and
# are not defined anywhere in the visible snippet — confirm they exist.
df.select('uuid', 'latitude', 'longitude', xlat, xlong, F.min('distance_in_meters').over(w)).count() #1510960 rows
Metro
+---+----------+----------------+----------+----------+
|id |xlong |xlat |station_id|type_train|
+---+----------+----------------+----------+----------+
|1 |-73.668172|45.5552769999931|1 |métro |
|2 |-73.668486|45.5542469999931|2 |métro |
|3 |-73.668225|45.5556069999931|3 |métro |
|4 |-73.667407|45.5561219999931|4 |métro |
+---+----------+----------------+----------+----------+
Coordinates
+-----+---------+----------+
|uuid | latitude| longitude|
+-----+---------+----------+
|1009 | 45.53175| -73.62613|
|1009 | 45.53163| -73.62546|
+-----+---------+----------+
After CROSS JOIN
+----+--------+---------+----------+----------------+----------+----------+------------------+
|uuid|latitude|longitude|station_id| xlat| xlong|type_train|distance_in_meters|
+----+--------+---------+----------+----------------+----------+----------+------------------+
|1009|45.53175|-73.62613| 2|45.5542469999931|-73.668486| metro|237197.13838255248|
|1009|45.53163|-73.62546| 2|45.5542469999931|-73.668486| metro|240044.33000560844|
|1009|45.53175|-73.62613| 1|45.5552769999931|-73.668172| metro| 240121.5093484111|
|1009|45.53175|-73.62613| 4|45.5561219999931|-73.667407| metro|240897.59082511123|
|1009|45.53175|-73.62613| 3|45.5556069999931|-73.668225| metro|241622.85492502493|
|1009|45.53163|-73.62546| 1|45.5552769999931|-73.668172| metro|242937.79388593792|
|1009|45.53163|-73.62546| 4|45.5561219999931|-73.667407| metro| 243679.8807249287|
|1009|45.53163|-73.62546| 3|45.5556069999931|-73.668225| metro| 244431.2963545028|
+----+--------+---------+----------+----------------+----------+----------+------------------+
Desirable results
+----+--------+---------+----------+----------------+----------+----------+------------------+
|uuid|latitude|longitude|station_id| xlat| xlong|type_train|distance_in_meters|
+----+--------+---------+----------+----------------+----------+----------+------------------+
|1009|45.53175|-73.62613| 2|45.5542469999931|-73.668486| metro|237197.13838255248|
|1009|45.53163|-73.62546| 2|45.5542469999931|-73.668486| metro|240044.33000560844|
+----+--------+---------+----------+----------------+----------+----------+------------------+
答案 0 :(得分:1)
创建地铁站数据框
# Sample metro stations: each tuple lists one station's values in the order
# given by _METRO_FIELDS, then every tuple is turned into a dict record.
_METRO_FIELDS = ('id', 'xlong', 'xlat', 'station_id', 'type_train')
_METRO_ROWS = (
    (1, -73.668172, 45.5552769999931, 1, 'metro'),
    (2, -73.668486, 45.5542469999931, 2, 'metro'),
    (3, -73.668225, 45.5556069999931, 3, 'metro'),
    (4, -73.667407, 45.5561219999931, 4, 'metro'),
)
metro = [dict(zip(_METRO_FIELDS, station)) for station in _METRO_ROWS]
# Build the stations DataFrame from the list of dict records.
metroDF = spark.createDataFrame(metro)
创建coordinates数据框
# Sample coordinate points: each tuple is (uuid, latitude, longitude) and is
# turned into a dict record keyed by _COORD_FIELDS.
_COORD_FIELDS = ('uuid', 'latitude', 'longitude')
_COORD_ROWS = (
    (1009, 45.53175, -73.62613),
    (1009, 45.53163, -73.62546),
)
coord = [dict(zip(_COORD_FIELDS, point)) for point in _COORD_ROWS]
# Build the coordinates DataFrame from the list of dict records.
coordDF = spark.createDataFrame(coord)
创建表
# Expose both DataFrames to Spark SQL under the table names 'coordinates' and
# 'metro_table'. createOrReplaceTempView supersedes registerTempTable, which
# has been deprecated since Spark 2.0. Both return None, so the result is
# deliberately not bound to a variable.
coordDF.createOrReplaceTempView('coordinates')
metroDF.createOrReplaceTempView('metro_table')
# Join the two DataFrames with a CROSS JOIN: pair every coordinate with every
# metro station and compute the great-circle distance of each pair.
#
# The distance uses the spherical law of cosines: ACOS(...) yields the central
# angle in radians, and arc length = Earth radius (6371000 m) * that angle.
# The angle must NOT go through DEGREES(); doing so inflates every distance by
# 180/pi (~57.3). Sample output captured with the DEGREES factor shows values
# larger by exactly that factor; the ranking of stations is unaffected.
#
# LEAST/GREATEST clamp the cosine to [-1, 1] so that floating-point rounding on
# identical or nearly identical points cannot make ACOS return NaN.
#
# A triple-quoted string keeps the line breaks, so neighbouring SQL tokens
# cannot run together the way they can with backslash continuations.
query = """
SELECT uuid,
       latitude,
       longitude,
       p.station_id,
       p.xlat,
       p.xlong,
       p.type_train,
       6371000 * ACOS(LEAST(1.0, GREATEST(-1.0,
             COS(RADIANS(p.xlat))
           * COS(RADIANS(latitude))
           * COS(RADIANS(p.xlong) - RADIANS(longitude))
           + SIN(RADIANS(p.xlat))
           * SIN(RADIANS(latitude))))) AS distance_in_meters
FROM coordinates
CROSS JOIN (
    SELECT id AS id_transport,
           station_id,
           xlat,
           xlong,
           type_train
    FROM metro_table
) AS p ON 1=1
ORDER BY distance_in_meters
"""
执行查询
# Run the query through the same `spark` session object that created the
# DataFrames; `sqlContext` is the legacy entry point and is never defined in
# this snippet.
df = spark.sql(query)
最后为每个点选择最近的车站
# Import the functions module under an alias instead of `import *`, which
# would shadow Python builtins such as min, max, sum and round.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# One partition per coordinate point, closest station first. station_id is a
# secondary sort key so that ties on distance resolve deterministically.
w = Window.partitionBy(['uuid', 'latitude', 'longitude']).orderBy('distance_in_meters', 'station_id')
# row_number() numbers the rows of each partition starting at 1, so keeping
# rn == 1 leaves exactly one row (the nearest station) per point. The helper
# column is dropped afterwards.
dfTop = (
    df.withColumn("rn", F.row_number().over(w))
    .where(F.col('rn') == 1)
    .drop("rn")
)
dfTop.show()
+----+--------+---------+----------+----------------+----------+----------+------------------+
|uuid|latitude|longitude|station_id| xlat| xlong|type_train|distance_in_meters|
+----+--------+---------+----------+----------------+----------+----------+------------------+
|1009|45.53163|-73.62546| 2|45.5542469999931|-73.668486| metro|240044.33000560844|
|1009|45.53175|-73.62613| 2|45.5542469999931|-73.668486| metro|237197.13838255248|
+----+--------+---------+----------+----------------+----------+----------+------------------+