我想计算一个模拟数据集中两组相互关联的空间坐标(program和admin)之间的距离。数据是宽格式的,因此每一行同时包含这两对坐标。
library(sp)
# Reproducible toy data: one row per program, holding a program point and an
# admin point. Latitudes are drawn uniformly from [-90, 90] and longitudes
# from [-180, 180].
set.seed(1)
n <- 100
program.id <- seq_len(n)
# The draw order (program lat, program long, admin lat, admin long) fixes
# which random values land in which column.
c1 <- cbind(runif(n, min = -90, max = 90), runif(n, min = -180, max = 180))
c2 <- cbind(runif(n, min = -90, max = 90), runif(n, min = -180, max = 180))
dat <- as.data.frame(cbind(program.id, c1, c2))
names(dat) <- c(
  "program.id",
  "program.lat", "program.long",
  "admin.lat", "admin.long"
)
head(dat)
# program.id program.lat program.long admin.lat admin.long
# 1 1 -42.20844 55.70061 -41.848523 62.536404
# 2 2 -23.01770 -52.84898 -50.643849 -145.851172
# 3 3 13.11361 -82.70635 3.023431 -2.665397
# 4 4 73.47740 177.36626 -41.588893 -13.841337
# 5 5 -53.69725 48.05758 -57.389701 -44.922049
# 6 6 71.71014 -103.24507 3.343705 176.795719
我知道如何使用sp包在program或admin各自的坐标之间创建距离矩阵:
# Pairwise great-circle distance matrix (km) among all program locations.
# spDistsN1() expects longitude in the first column and latitude in the
# second, so the columns are selected in that order.
ll <- c("program.long", "program.lat")
coords <- dat[ll]
# Convert once instead of on every iteration; spDistsN1() needs a numeric matrix.
coord_mat <- as.matrix(coords)
# Each call returns the distances from every point to `each_point`; apply()
# binds those vectors together as the columns of an n x n matrix.
dist <- apply(coord_mat, 1, function(each_point) {
  spDistsN1(coord_mat, each_point, longlat = TRUE)
})
但我想要做的是为每一行中的这对坐标计算距离,得到一个n×1的向量(dist.km),并将其添加到dat中。
# program.id program.lat program.long admin.lat admin.long dist.km
# 1 1 -42.20844 55.70061 -41.848523 62.536404 567.35
# 2 2 -23.01770 -52.84898 -50.643849 -145.851172 8267.86
# ...
有什么建议吗?我花了一段时间来处理旧的SO问题,但似乎没有什么是正确的。很高兴被证明是错误的。
更新
@Amit的解决方案适用于我的玩具数据集:
# Row-wise distance using sp::spDistsN1(): each row of `dat` is passed to the
# function as the vector x.
# NOTE(review): in the toy `dat`, x[2:3] is (program.lat, program.long) and
# x[3:4] is (program.long, admin.lat), so the second point mixes the two
# coordinate pairs; the admin coordinates are in x[4:5]. spDistsN1() documents
# longitude as the first column, so the lat/long order would also need swapping.
apply(dat,1,function(x) spDistsN1(matrix(x[2:3],nrow=1),x[3:4],longlat=TRUE))
但是我认为需要交换lat、long两列的顺序,使long列排在lat列之前。摘自?spDistsN1:
pts: A matrix of 2D points, first column x/longitude, second column y/latitude, or a SpatialPoints or SpatialPointsDataFrame object
另外,除非我误解了逻辑,否则我认为Amit的解决方案应该抓住cols [2:3]和[4:5],而不是[2:3]和[3:4]。
我现在的挑战是将其应用于我的实际数据。我在下面复制了一部分。
library(sp)
# Excerpt of the real data: one row per sub-county, holding a program point
# (pro.long, pro.lat) and a sub-county point (sub.long, sub.lat).
# The list is already named, so no separate names attribute is needed.
dat <- structure(
  list(
    ID = seq_len(4L),
    subcounty = letters[1:4],
    pro.long = c(33.47627919, 31.73605491, 31.54073482, 31.51748984),
    pro.lat = c(2.73996953, 3.26530095, 3.21327597, 3.17784981),
    sub.long = c(33.47552, 31.78307, 31.53083, 31.53083),
    sub.lat = c(2.740362, 3.391209, 3.208736, 3.208736)
  ),
  row.names = c(NA, 4L),
  class = "data.frame"
)
head(dat)
# ID subcounty pro.long pro.lat sub.long sub.lat
# 1 1 a 33.47628 2.739970 33.47552 2.740362
# 2 2 b 31.73605 3.265301 31.78307 3.391209
# 3 3 c 31.54073 3.213276 31.53083 3.208736
# 4 4 d 31.51749 3.177850 31.53083 3.208736
# Attempt to compute one distance per row with apply(): each row of `dat` is
# passed as the vector x, where x[3:4] is (pro.long, pro.lat) and x[5:6] is
# (sub.long, sub.lat).
# NOTE(review): `dat` contains the character column `subcounty`, so apply()
# converts the data frame to a character matrix before iterating, and
# spDistsN1() then stops with "pts must be numeric".
apply(dat, 1, function(x) spDistsN1(matrix(x[3:4], nrow=1),
x[5:6],
longlat=TRUE))
我收到错误:Error in spDistsN1(matrix(x[3:4], nrow = 1), x[5:6], longlat = TRUE) : pts must be numeric
我很困惑因为这些列是数字的:
> is.numeric(dat$pro.long)
[1] TRUE
> is.numeric(dat$pro.lat)
[1] TRUE
> is.numeric(dat$sub.long)
[1] TRUE
> is.numeric(dat$sub.lat)
[1] TRUE
答案 0 :(得分:5)
您遇到的问题是apply(...)会把它的第一个参数强制转换为矩阵。根据定义,矩阵的所有元素必须具有相同的数据类型。由于dat中有一列(dat$subcounty)是字符型,apply(...)会把所有内容都强制转换为字符型。在您的测试数据集中,所有列都是数值型,因此没有遇到这个问题。
这应该有效:
# Great-circle distance (km) between the two points stored in each row.
# Rows are indexed directly rather than passed through apply(), so only the
# numeric coordinate columns are converted to a matrix and the character
# column `subcounty` never triggers a character coercion.
# Columns 3:4 are (pro.long, pro.lat) and 5:6 are (sub.long, sub.lat), i.e.
# longitude first, as spDistsN1() requires.
dat$dist.km <- vapply(
  seq_len(nrow(dat)),  # seq_len() is safe when dat has zero rows
  function(i) {
    spDistsN1(as.matrix(dat[i, 3:4]), as.matrix(dat[i, 5:6]), longlat = TRUE)
  },
  numeric(1)  # exactly one distance per row
)
答案 1 :(得分:3)
使用data.table
和geosphere
提供了更快的解决方案。
library(data.table)
library(geosphere)
# Convert `dat` to a data.table, then add `dist_km` in a single vectorised
# distGeo() call over all rows. distGeo() output is divided by 1000 to give
# kilometres. Both points are passed as two-column (longitude, latitude)
# matrices.
setDT(dat)
dat[, dist_km := distGeo(
  cbind(pro.long, pro.lat, deparse.level = 0),
  cbind(sub.long, sub.lat, deparse.level = 0)
) / 1000]
<strong>基准测试:</strong>
library(sp)
# Benchmark candidate: row-by-row distances with sp::spDistsN1().
#
# dat: data frame whose columns 3:4 hold the first point and columns 5:6 the
#      second point of each row (longitude before latitude, as spDistsN1()
#      expects).
# Returns (invisibly) the numeric vector of distances, one per row. The
# `dist.km` column is added to the function's local copy of `dat` only.
jlhoward <- function(dat) {
  dist_km <- vapply(
    seq_len(nrow(dat)),  # unlike 1:nrow(dat), yields no iterations for 0 rows
    function(i) {
      spDistsN1(as.matrix(dat[i, 3:4]), as.matrix(dat[i, 5:6]), longlat = TRUE)
    },
    numeric(1)
  )
  dat$dist.km <- dist_km
  invisible(dist_km)
}
# Benchmark candidate: a single vectorised geosphere::distGeo() call.
#
# dat2: table with columns pro.long, pro.lat, sub.long and sub.lat.
# setDT() turns dat2 into a data.table, and `:=` adds the column `dist_km`
# (distGeo() output divided by 1000). The value of the `[` call is returned.
rafa.pereira <- function(dat2) {
  setDT(dat2)
  dat2[, dist_km := distGeo(
    cbind(pro.long, pro.lat, deparse.level = 0),
    cbind(sub.long, sub.lat, deparse.level = 0)
  ) / 1000]
}
> system.time( jlhoward(dat) )
user system elapsed
8.94 0.00 8.94
> system.time( rafa.pereira(dat) )
user system elapsed
0.07 0.00 0.08
# Base sample for the benchmark: four rows, each with a program point
# (pro.long, pro.lat) and a sub-county point (sub.long, sub.lat).
dat <- structure(
  list(
    ID = seq_len(4L),
    subcounty = letters[1:4],
    pro.long = c(33.47627919, 31.73605491, 31.54073482, 31.51748984),
    pro.lat = c(2.73996953, 3.26530095, 3.21327597, 3.17784981),
    sub.long = c(33.47552, 31.78307, 31.53083, 31.53083),
    sub.lat = c(2.740362, 3.391209, 3.208736, 3.208736)
  ),
  row.names = c(NA, 4L),
  class = "data.frame"
)
# Enlarge the dataset to 40,000 pairs by repeating the four-row block
# 10,000 times.
row_index <- seq_len(nrow(dat))
dat <- dat[rep(row_index, times = 10000), ]
rm(row_index)