我正在分析我的 Google 位置记录(如果有人感兴趣,可以从这里导出自己的数据)。
现在,数据集不包含任何包含城市名称的字段,但是只要每行都有纬度/经度组合,我们就可以自己计算。
鉴于我的数据集长为120万行,因此无法使用免费的地理编码API(流量明显受到限制)。
以下是几个机场的位置:
# Airport lookup table: one row per airport (two rows in this sample).
# The distance code in this file only reads V4 (city label) and
# V15 / V16 (decimal latitude / longitude).
# NOTE(review): V1-V3 look like ICAO code, IATA code and airport name, and
# V6-V13 like degrees / minutes / seconds / hemisphere for latitude and
# longitude - confirm against the source dataset.
airport_coords <- data.frame(
  V1 = c("LIMC", "LIRF"),
  V2 = c("MXP", "FCO"),
  V3 = c("MALPENSA", "FIUMICINO"),
  V4 = c("MILANO", "ROME"),
  V5 = c("ITALY", "ITALY"),
  V6 = c(45L, 41L),
  V7 = c(37L, 48L),
  V8 = c(53L, 46L),
  V9 = c("N", "N"),
  V10 = c(8L, 12L),
  V11 = c(43L, 15L),
  V12 = c(40L, 11L),
  V13 = c("E", "E"),
  V14 = c(234L, 4L),
  V15 = c(45.631, 41.813),
  V16 = c(8.728, 12.253),
  # Keep text columns as character on every R version
  stringsAsFactors = FALSE
)
这是Google位置记录的简化版本的几行内容
# Simplified sample of the location history: 20 rows, one per recorded
# position, with two numeric columns (latitudeGPS, longitudeGPS).
# Elsewhere in this file the columns are used as y / x coordinates in
# EPSG:4326, i.e. decimal degrees.
# row.names = c(NA, 20L) is R's compact row-name encoding for 20 rows.
loc_history <-
structure(list(latitudeGPS = c(41.8713521, 41.8713478, 41.8714064,
41.8714201, 41.8713419, 41.8713981, 41.8713237, 41.8714538, 41.8713845,
41.8714139, 41.8714417, 41.8714538, 41.8714417, 41.8714538, 41.8714538,
41.8714538, 41.8714538, 41.8714538, 41.8714594, 41.8714594),
longitudeGPS = c(12.4414861, 12.441478, 12.4415342, 12.4415539,
12.4414757, 12.4415345, 12.4414538, 12.4415871, 12.441514,
12.4415466, 12.4415735, 12.4415871, 12.4415735, 12.4415871,
12.4415871, 12.4415871, 12.4415871, 12.4415871, 12.4415954,
12.4415954)), row.names = c(NA, 20L), class = "data.frame")
我的方法是计算每个经纬度坐标与我感兴趣的城市的机场(来自这个数据集)之间的距离,并假设如果距离小于 50 km,该位置就在机场所在的城市(对我的需求来说应该足够准确)。
我编写了下面的 for 循环(我知道……),但它运行得非常慢。我想用 apply 系列之类的向量化函数,把这堆“垃圾场之火”改写成速度更快的实现。
library(raster) # for pointDistance
library(dplyr)
# Label every row of loc_history with the city of its nearest airport.
# Result: data frame `dist` with one row per location:
#   dist_mt - distance in metres to the nearest airport (NA when "other")
#   city    - airport city (V4) if the nearest airport is within 50 km,
#             otherwise "other"
# Init the result df; rows are filled in (and appended) one location at a time
dist <- data.frame(
  dist_mt = NA,
  city = NA
)
# seq_len() instead of 1:nrow() so an empty input runs zero iterations
for (i in seq_len(nrow(loc_history))) {
  # Tmp df holding the distance from location i to every airport
  tmp <- data.frame(
    dist_mt = NA,
    city = NA
  )
  for (x in seq_len(nrow(airport_coords))) {
    # Compute point - airport distance. pointDistance() takes each point as
    # c(x, y), i.e. longitude first, then latitude.
    v <- pointDistance(
      c(loc_history$longitudeGPS[i], loc_history$latitudeGPS[i]),
      c(airport_coords$V16[x], airport_coords$V15[x]),
      lonlat = TRUE
    )
    # Append to tmp dataframe
    tmp[x, "dist_mt"] <- v
    tmp[x, "city"] <- airport_coords$V4[x] # Keep city label
  }
  # which.min() yields exactly one row even when two airports tie, and
  # integer(0) when every distance is NA (treated as "other")
  nearest <- which.min(tmp$dist_mt)
  # Append city if distance <= 50km
  if (length(nearest) == 1L && tmp$dist_mt[nearest] <= 50000) {
    dist[i, ] <- tmp[nearest, ]
  } else {
    dist[i, "city"] <- "other"
  }
}
循环处理约 1,000 行大约需要 4 秒。数据共有 120 万行,全部运行大约需要 80 分钟。
答案 0 :(得分:2)
尝试使用sf
和lwgeom
软件包:
library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
# Promote both tables to sf point layers in EPSG:4326. `coords` is given
# x first, then y, i.e. the longitude column followed by the latitude column.
airport_coords <- st_as_sf(
  airport_coords,
  coords = c("V16", "V15"),
  crs = 4326
)
loc_history <- st_as_sf(
  loc_history,
  coords = c("longitudeGPS", "latitudeGPS"),
  crs = 4326
)
# Distance matrix: one row per location, one column per airport
dist <- st_distance(loc_history, airport_coords)
dist
#> Units: [m]
#> [,1] [,2]
#> [1,] 513625.5 16943.33
#> [2,] 513625.5 16942.53
#> [3,] 513622.8 16949.33
#> [4,] 513622.4 16951.42
#> [5,] 513625.9 16942.10
#> [6,] 513623.5 16949.00
#> [7,] 513626.6 16939.65
#> [8,] 513620.9 16955.40
#> [9,] 513623.8 16946.85
#> [10,] 513622.6 16950.60
#> [11,] 513621.4 16953.84
#> [12,] 513620.9 16955.40
#> [13,] 513621.4 16953.84
#> [14,] 513620.9 16955.40
#> [15,] 513620.9 16955.40
#> [16,] 513620.9 16955.40
#> [17,] 513620.9 16955.40
#> [18,] 513620.9 16955.40
#> [19,] 513620.8 16956.27
#> [20,] 513620.8 16956.27
# For each location (row of `dist`), the city (V4) of the nearest airport if
# it lies within 50 km (50000 m); NA otherwise. Always a character vector.
closest <- apply(dist, 1, function(r) {
  # which.min() returns integer(0) when the whole row is NA
  nearest <- which.min(r)
  if (length(nearest) == 1L && r[nearest] <= 50000) {
    airport_coords$V4[nearest]
  } else {
    # Typed NA keeps the result character even when nothing is in range
    NA_character_
  }
})
由reprex package(v0.3.0)于2020-01-15创建
答案 1 :(得分:1)
根据 pointDistance 的帮助文档,您需要先把数据转换成矩阵再传入,而不是一次只传入一个点:
参数
p1 第一个点(组)的x和y坐标,可以是c(x,y),矩阵(ncol = 2)或SpatialPoints *。
p2 第二个点(组)的x和y坐标(形式同p1)。如果缺少此参数,则计算p1中各点之间的距离矩阵。
所以要一口气把整个事情都搞定,
# Distance from every location (rows) to every airport (columns).
# Each point set is a two-column matrix with x (longitude) first and
# y (latitude) second.
distmat <- pointDistance(
  cbind(loc_history$longitudeGPS, loc_history$latitudeGPS),
  cbind(airport_coords$V16, airport_coords$V15),
  lonlat = TRUE
)
distmat
#> [,1] [,2]
#> [1,] 513625.5 16943.33
#> [2,] 513625.5 16942.53
#> [3,] 513622.8 16949.33
#> [4,] 513622.4 16951.42
#> [5,] 513625.9 16942.10
#> [6,] 513623.5 16949.00
#> [7,] 513626.6 16939.65
#> [8,] 513620.9 16955.40
#> [9,] 513623.8 16946.85
#> [10,] 513622.6 16950.60
#> [11,] 513621.4 16953.84
#> [12,] 513620.9 16955.40
#> [13,] 513621.4 16953.84
#> [14,] 513620.9 16955.40
#> [15,] 513620.9 16955.40
#> [16,] 513620.9 16955.40
#> [17,] 513620.9 16955.40
#> [18,] 513620.9 16955.40
#> [19,] 513620.8 16956.27
#> [20,] 513620.8 16956.27
由于每一列代表到一个机场的距离(列的顺序与机场数据框中的行顺序一致),因此只要找出每一行中最小值所在的位置,就得到了最近机场的索引。您可以使用apply:
# For every location (row of distmat): the city (V4) of the closest airport
# when that airport is less than 50000 away, otherwise NA.
loc_history$nearest_airport <- apply(distmat, 1, function(row_dist) {
  closest_idx <- which.min(row_dist)
  if (row_dist[closest_idx] < 50000) {
    airport_coords$V4[closest_idx]
  } else {
    NA
  }
})
# Distance to that closest airport, stored regardless of the cutoff
loc_history$distance_to_nearest_airport <- apply(distmat, 1, min)
这应该是您想要的结果:
loc_history
#> latitudeGPS longitudeGPS nearest_airport distance_to_nearest_airport
#> 1 41.87135 12.44149 ROME 16943.33
#> 2 41.87135 12.44148 ROME 16942.53
#> 3 41.87141 12.44153 ROME 16949.33
#> 4 41.87142 12.44155 ROME 16951.42
#> 5 41.87134 12.44148 ROME 16942.10
#> 6 41.87140 12.44153 ROME 16949.00
#> 7 41.87132 12.44145 ROME 16939.65
#> 8 41.87145 12.44159 ROME 16955.40
#> 9 41.87138 12.44151 ROME 16946.85
#> 10 41.87141 12.44155 ROME 16950.60
#> 11 41.87144 12.44157 ROME 16953.84
#> 12 41.87145 12.44159 ROME 16955.40
#> 13 41.87144 12.44157 ROME 16953.84
#> 14 41.87145 12.44159 ROME 16955.40
#> 15 41.87145 12.44159 ROME 16955.40
#> 16 41.87145 12.44159 ROME 16955.40
#> 17 41.87145 12.44159 ROME 16955.40
#> 18 41.87145 12.44159 ROME 16955.40
#> 19 41.87146 12.44160 ROME 16956.27
#> 20 41.87146 12.44160 ROME 16956.27
如果50公里之内没有机场,则nearest_airport列中的值为NA。
换句话说,您可以将整个“垃圾场之火”替换为:
# Distance from every location (rows) to every airport (columns).
# Each point set is a two-column matrix with x (longitude) first and
# y (latitude) second.
distmat <- pointDistance(
  cbind(loc_history$longitudeGPS, loc_history$latitudeGPS),
  cbind(airport_coords$V16, airport_coords$V15),
  lonlat = TRUE
)
# City (V4) of the closest airport when it is less than 50000 away, else NA
loc_history$nearest_airport <- apply(distmat, 1, function(row_dist) {
  closest_idx <- which.min(row_dist)
  if (row_dist[closest_idx] < 50000) {
    airport_coords$V4[closest_idx]
  } else {
    NA
  }
})
# Distance to that closest airport, stored regardless of the cutoff
loc_history$distance_to_nearest_airport <- apply(distmat, 1, min)