注意:此问题是上一个问题“r - Finding closest coordinates between two large data sets”的后续问题。
我希望根据两个数据集中的坐标,为数据集1中的每个条目找出数据集2中距离最近的条目。数据集1包含180,000行(其中仅有1,800个唯一坐标),数据集2包含4,500行(4,500个坐标全部唯一)。上面引用的帖子中给出了一种解决方法,但它使用的是 RANN::nn2,
计算的是欧几里得距离,而不是椭球面(Vincenty)距离。
当前代码:
# Euclidean 1-nearest-neighbour search: for every df1 point (query), find the
# closest df2 point (data). nn2() returns the row index into df2 and the
# distance, which land in columns 4 (SRC_ID) and 5 (distance) of df1.
df1[, 4:5] <- as.data.frame(
  RANN::nn2(data = df2[, 2:3], query = df1[, 2:3], k = 1)
)
# Column 4 currently holds df2 row numbers; translate them into SRC_ID values.
df1[[4]] <- df2[[1]][df1[[4]]]
# id HIGH_PRCN_LAT HIGH_PRCN_LON SRC_ID distance
# 1 1 52.88144 -2.873778 44 0.7990743
# 2 2 57.80945 -2.234544 5688 2.1676868
# 3 4 34.02335 -3.098445 61114 1.4758202
# 4 5 63.80879 -2.439163 23 4.2415854
# 5 6 53.68881 -7.396112 54 3.6445416
# 6 7 63.44628 -5.162345 23 2.3577811
# 7 8 21.60755 -8.633113 440 8.2123762
# 8 9 78.32444 3.813290 76 11.4936496
# 9 10 66.85533 -3.994326 55 1.9296370
# 10 3 51.62354 -8.906553 54 3.2180026
我猜想解决方案会用到 geosphere::distVincentyEllipsoid,
但不确定如何将它整合进现有代码。
r详细信息
platform x86_64-w64-mingw32
version.string R version 3.5.3 (2019-03-11)
数据集1输入(示例数据,未去重为唯一坐标)
# Dataset 1 (query points): id plus latitude/longitude in degrees.
# SRC_ID and distance are NA placeholders that the nearest-neighbour search
# fills in later.
df1 <- structure(
  list(
    # Every id carries the L suffix so the column is integer; a single bare
    # number in the vector would silently promote the whole column to double.
    id = c(1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 3L),
    HIGH_PRCN_LAT = c(
      52.881442267773, 57.8094538200198, 34.0233529, 63.8087900198,
      53.6888144440184, 63.4462810678651, 21.6075544376207,
      78.324442654172, 66.85532539759495, 51.623544596
    ),
    HIGH_PRCN_LON = c(
      -2.87377812157822, -2.23454414781635, -3.0984448341,
      -2.439163178635, -7.396111601421454, -5.162345043546359,
      -8.63311254098095, 3.813289888829932, -3.994325961186105,
      -8.9065532453272409
    ),
    SRC_ID = rep(NA, 10L),
    distance = rep(NA, 10L)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
数据集2输入
# Dataset 2 (candidate points): one SRC_ID per latitude/longitude pair,
# coordinates in degrees.
df2 <- structure(
  list(
    SRC_ID = c(55L, 54L, 23L, 11L, 44L, 21L, 76L, 5688L, 440L, 61114L),
    HIGH_PRCN_LAT = c(
      68.46506, 50.34127, 61.16432, 42.57807, 52.29879,
      68.52132, 87.83912, 55.67825, 29.74444, 34.33228
    ),
    HIGH_PRCN_LON = c(
      -5.0584, -5.95506, -5.75546, -5.47801, -3.42062,
      -6.99441, -2.63457, -2.63057, -7.52216, -1.65532
    )
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
答案 0 :(得分:1)
使用 geosphere 包中的 distVincentyEllipsoid
函数:
library(geosphere)

# For every row of df1, find the df2 point with the smallest Vincenty
# (ellipsoidal) distance. The result is a numeric matrix with one row per df1
# row and two columns: SRC_ID of the nearest df2 point and the distance to it
# (distVincentyEllipsoid reports metres with its default ellipsoid).

# geosphere expects (longitude, latitude) order, hence columns 3 then 2.
targets <- as.matrix(df2[, c(3, 2)])

# Rows of df1 that share a coordinate pair have the same answer, so the
# expensive search runs once per distinct pair and is then mapped back.
# The key uses the default character form of the doubles, so pairs that differ
# only beyond ~15 significant digits are treated as the same point.
coord_key <- paste(df1[[3]], df1[[2]])
is_first <- !duplicated(coord_key)
queries <- as.matrix(df1[is_first, c(3, 2)])

nearest_unique <- vapply(
  seq_len(nrow(queries)),
  function(i) {
    d <- distVincentyEllipsoid(queries[i, ], targets)
    j <- which.min(d) # ignores NA distances
    if (length(j) == 0L) {
      # No usable distance for this point at all.
      return(c(SRC_ID = NA_real_, distance = NA_real_))
    }
    # Take the distance at the same index as the id so the two always agree.
    c(SRC_ID = df2[j, 1], distance = d[[j]])
  },
  c(SRC_ID = 0, distance = 0)
)

# vapply() yields a 2 x n matrix even when df2 has a single row; transpose and
# expand from distinct coordinates back to every row of df1.
nearest <- t(nearest_unique)[match(coord_key, coord_key[is_first]), , drop = FALSE]
rownames(nearest) <- rownames(df1)

# To store the result in df1: df1[, c(4, 5)] <- nearest
nearest
SRC_ID distance
1 44 74680.48
2 5688 238553.51
3 61114 137385.18
4 23 340642.70
5 44 308458.73
6 23 256176.88
7 440 908292.28
8 76 1064419.47
9 55 185119.29
10 54 251580.45
只需把上面的结果赋值给 df1[, c(4, 5)](即 df1[,c(4,5)] <- t(apply(...))),
即可将这些值写入 df1
的对应列。
(旧答案)使用 rgeos::gDistance
。它计算的是笛卡尔(平面)距离;不过我正是以下面这个解决方案为起点,才得出了上面发布的更新答案:
library(sp)
library(rgeos)

# Convert both tables to spatial point sets; coordinates are (lon, lat).
df1rgsp <- SpatialPointsDataFrame(df1[, c(3, 2)], df1[, -c(3, 2)])
df2rgsp <- SpatialPointsDataFrame(df2[, c(3, 2)], data.frame(SRC_ID = df2[, 1]))

# Pairwise planar (Cartesian) distances in coordinate units (degrees here).
# With byid = TRUE the matrix has one ROW per df2 point and one COLUMN per
# df1 point, so the nearest df2 point for each df1 row is found per column.
# Searching per row instead would return, for each df2 point, its nearest df1
# row, which only runs at all when both tables have the same number of rows.
dist_m <- gDistance(df1rgsp, df2rgsp, byid = TRUE)
nearest_row <- unname(apply(dist_m, 2, which.min))

# Distance to the chosen df2 point, then its SRC_ID.
df1[, 5] <- dist_m[cbind(nearest_row, seq_len(ncol(dist_m)))]
df1[, 4] <- df2[nearest_row, 1]
# Expected result (planar distance, so it matches the Euclidean nn2 output):
#    id HIGH_PRCN_LAT HIGH_PRCN_LON SRC_ID   distance
# 1   1      52.88144     -2.873778     44  0.7990743
# 2   2      57.80945     -2.234544   5688  2.1676868
# 3   4      34.02335     -3.098445  61114  1.4758202
# 4   5      63.80879     -2.439163     23  4.2415854
# 5   6      53.68881     -7.396112     54  3.6445416
# 6   7      63.44628     -5.162345     23  2.3577811
# 7   8      21.60755     -8.633113    440  8.2123762
# 8   9      78.32444      3.813290     76 11.4936496
# 9  10      66.85533     -3.994326     55  1.9296370
# 10  3      51.62354     -8.906553     54  3.2180026