使用 PySpark 查找最近的点

时间:2018-04-19 16:23:55

标签: apache-spark pyspark apache-spark-sql spark-dataframe

# Register both input DataFrames under SQL-visible table names so the query
# below can refer to them as `coordinates` and `metro_table`.
# NOTE(review): registerTempTable presumably returns None, so the two Python
# names bound here would not hold tables -- confirm against the PySpark docs.
coordinates = df0.registerTempTable('coordinates')  # 5555 rows
metro_table = df1.registerTempTable('metro_table')  # 272 rows

我想为每个点找到离它最近的地铁站。为此我写了下面这个 SQL 查询,但我不明白为什么 Spark 不接受 INNER JOIN。我的第二个问题是:如何只返回第一行?我尝试用 fetch first 1 rows only 只取第 1 行,但它报了一个缩进错误。

# Cross-join every row of `coordinates` with the metro stations, compute a
# distance column for each pair and order the result by that distance.
# NOTE(review): the inline "# Doesn't work in Spark" remark follows the
# backslash on the `fetch first` line, i.e. it sits inside the string literal;
# the backslash is then no longer the last character of the line, so the
# literal is left unterminated at that point.
# NOTE(review): 6371000 * DEGREES(ACOS(...)) scales an angle expressed in
# degrees by a radius in metres; the sample output further down (~237 km for
# points a few km apart) suggests the DEGREES() wrapper should be dropped --
# confirm.
query = "SELECT uuid,\
            latitude,\
            longitude,\
            p.station_id,\
            p.xlat,\
            p.xlong,\
            p.type_train,\
            p.id_transport,\
              6371000* DEGREES(ACOS(COS(RADIANS(p.xlat))\
                     * COS(RADIANS(latitude))\
                     * COS(RADIANS(p.xlong) - RADIANS(longitude))\
                     + SIN(RADIANS(p.xlat))\
                     * SIN(RADIANS(latitude)))) AS distance_in_meters\
     FROM coordinates\
     CROSS JOIN (\
         SELECT id AS id_transport,\
                station_id,\
                xlat,\
                xlong,\
                type_train\
         FROM metro_table\
         fetch first 1 rows only\   # Doesn't work in Spark
        ) AS p ON 1=1\
     ORDER BY distance_in_meters"

# Run the query through the SQL context; the result holds one row per
# (coordinate, station) pair.
df = sqlContext.sql(query)   # 1510960 rows

使用Pyspark(仅提取前1行)

# Attempt to keep only the nearest station per point with a window that
# partitions by point and orders by distance.
# NOTE(review): xlat and xlong are passed as bare names rather than strings or
# Column objects; nothing in this snippet defines them.
# NOTE(review): F.min(...).over(w) only adds a column, it does not filter
# rows -- consistent with the unchanged row count noted on the line below.
w = Window.partitionBy(['uuid', 'latitude', 'longitude']).orderBy('distance_in_meters')
df.select('uuid', 'latitude', 'longitude', xlat, xlong, F.min('distance_in_meters').over(w)).count()  # 1510960 rows

enter image description here

Metro
+---+----------+----------------+----------+----------+
|id |xlong     |xlat            |station_id|type_train|
+---+----------+----------------+----------+----------+
|1  |-73.668172|45.5552769999931|1         |métro     |
|2  |-73.668486|45.5542469999931|2         |métro     |
|3  |-73.668225|45.5556069999931|3         |métro     |
|4  |-73.667407|45.5561219999931|4         |métro     |
+---+----------+----------------+----------+----------+

Coordinates
+-----+---------+----------+
|uuid | latitude| longitude|
+-----+---------+----------+
|1009 | 45.53175| -73.62613|
|1009 | 45.53163| -73.62546|
+-----+---------+----------+

After CROSS JOIN
 +----+--------+---------+----------+----------------+----------+----------+------------------+
 |uuid|latitude|longitude|station_id|            xlat|     xlong|type_train|distance_in_meters|
 +----+--------+---------+----------+----------------+----------+----------+------------------+
 |1009|45.53175|-73.62613|         2|45.5542469999931|-73.668486|     metro|237197.13838255248|
 |1009|45.53163|-73.62546|         2|45.5542469999931|-73.668486|     metro|240044.33000560844|
 |1009|45.53175|-73.62613|         1|45.5552769999931|-73.668172|     metro| 240121.5093484111|
 |1009|45.53175|-73.62613|         4|45.5561219999931|-73.667407|     metro|240897.59082511123|
 |1009|45.53175|-73.62613|         3|45.5556069999931|-73.668225|     metro|241622.85492502493|
 |1009|45.53163|-73.62546|         1|45.5552769999931|-73.668172|     metro|242937.79388593792|
 |1009|45.53163|-73.62546|         4|45.5561219999931|-73.667407|     metro| 243679.8807249287|
 |1009|45.53163|-73.62546|         3|45.5556069999931|-73.668225|     metro| 244431.2963545028|
 +----+--------+---------+----------+----------------+----------+----------+------------------+

Desirable results
+----+--------+---------+----------+----------------+----------+----------+------------------+
|uuid|latitude|longitude|station_id|            xlat|     xlong|type_train|distance_in_meters|
+----+--------+---------+----------+----------------+----------+----------+------------------+
|1009|45.53175|-73.62613|         2|45.5542469999931|-73.668486|     metro|237197.13838255248|
|1009|45.53163|-73.62546|         2|45.5542469999931|-73.668486|     metro|240044.33000560844|

1 个答案:

答案 0 :(得分:1)

创建地铁站数据框

# Sample metro stations.  Each tuple lists one station's values in the order
# given by _metro_columns; the tuples are zipped into one dict per station.
_metro_columns = ('id', 'xlong', 'xlat', 'station_id', 'type_train')
_metro_rows = [
    (1, -73.668172, 45.5552769999931, 1, 'metro'),
    (2, -73.668486, 45.5542469999931, 2, 'metro'),
    (3, -73.668225, 45.5556069999931, 3, 'metro'),
    (4, -73.667407, 45.5561219999931, 4, 'metro'),
]
metro = [dict(zip(_metro_columns, row)) for row in _metro_rows]

# Build the stations DataFrame from the list of dicts.
metroDF = spark.createDataFrame(metro)

创建 coordinates 数据框

# Two sample points that share uuid 1009; one dict is built per
# (latitude, longitude) pair.
coord = [
    dict(uuid=1009, latitude=lat, longitude=lon)
    for lat, lon in [(45.53175, -73.62613), (45.53163, -73.62546)]
]

# Build the points DataFrame from the list of dicts.
coordDF = spark.createDataFrame(coord)

创建表

# Expose both DataFrames to Spark SQL under the table names the query uses.
# createOrReplaceTempView supersedes the deprecated registerTempTable.  Both
# return None, so the result is deliberately not bound to a variable.
coordDF.createOrReplaceTempView('coordinates')
metroDF.createOrReplaceTempView('metro_table')

使用CROSS JOIN

连接数据框
# Pair every coordinate with every metro station and compute the great-circle
# distance between them (spherical law of cosines).
#
# ACOS already yields the central angle in radians, so multiplying it by the
# Earth's mean radius (6371000 m) gives metres directly.  The earlier version
# converted that angle from radians to degrees before multiplying, which
# inflated every distance by 180/pi (~57.3x).  Figures printed from that
# version are too large by that factor; the nearest-station ranking is
# unaffected because the scaling is uniform.
#
# LEAST/GREATEST clamp the cosine into [-1, 1]: floating-point rounding can
# push it marginally outside that range for (nearly) identical points, and
# ACOS would then return NaN.
query = """
SELECT uuid,
       latitude,
       longitude,
       p.station_id,
       p.xlat,
       p.xlong,
       p.type_train,
       6371000 * ACOS(LEAST(1.0, GREATEST(-1.0,
             COS(RADIANS(p.xlat))
           * COS(RADIANS(latitude))
           * COS(RADIANS(p.xlong) - RADIANS(longitude))
           + SIN(RADIANS(p.xlat))
           * SIN(RADIANS(latitude))))) AS distance_in_meters
FROM coordinates
CROSS JOIN (
    SELECT id AS id_transport,
           station_id,
           xlat,
           xlong,
           type_train
    FROM metro_table
) AS p ON 1=1
ORDER BY distance_in_meters
"""

执行查询

 # Run the query on the same `spark` session the DataFrames were created
 # with, instead of going through a separate `sqlContext` handle.
 df = spark.sql(query)

最后为每个点选择最近的车站

 from pyspark.sql.window import Window
 from pyspark.sql.functions import *

 # One window partition per point (uuid + latitude + longitude), ordered
 # from the nearest station to the farthest.
 w = (
     Window
     .partitionBy(['uuid', 'latitude', 'longitude'])
     .orderBy('distance_in_meters')
 )

 # Number the rows inside each partition, keep only the first (nearest) one,
 # then discard the helper column.
 _ranked = df.withColumn("rn", row_number().over(w))
 dfTop = _ranked.filter(col('rn') == 1).drop("rn")

 dfTop.show()

+----+--------+---------+----------+----------------+----------+----------+------------------+
|uuid|latitude|longitude|station_id|            xlat|     xlong|type_train|distance_in_meters|
+----+--------+---------+----------+----------------+----------+----------+------------------+
|1009|45.53163|-73.62546|         2|45.5542469999931|-73.668486|     metro|240044.33000560844|
|1009|45.53175|-73.62613|         2|45.5542469999931|-73.668486|     metro|237197.13838255248|
+----+--------+---------+----------+----------------+----------+----------+------------------+