在R?
中按组计算一组坐标之间的最大距离的最有效方法是什么?示例数据: 我有这样的数据,但不是x10000(这是示例)我拥有的数据更像是2500万个条目。
library(data.table)
data <- data.table(latitude=sample(seq(0,90,by=0.001), 10000, replace = TRUE),
longitude=sample(seq(0,180,by=0.001), 10000, replace = TRUE))
groupn <- nrow(data)/1000
data$group <- sample(seq(1,groupn,by=1),10000,replace=T)
我目前的方法很慢:
data <- data[order(data$group),]
library(dplyr)
library(sf)
library(foreach)
distlist <- foreach(i=1:10)%do%{
tempsf <- st_as_sf(filter(data,group==i), coords= c("longitude", "latitude"), crs=4326)
max(st_distance(tempsf, tempsf))
}
有些天才可以帮我加快速度吗?
答案 0 :(得分:2)
试试这个:
Euclidean dist:
echo '<a class="btn btn-info" href="'.$url.'">Video Link</a>';
WGS84:
> system.time(out1 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2]))))
user system elapsed
0.14 0.00 0.14
> out1
1 2 3 4 5 6 7 8 9 10
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123
Haversine方法:
> auxF <- function(x) {
+ require(sp)
+
+ tempsf <- data[x, 1:2]
+ coordinates(tempsf) <- c("longitude", "latitude")
+ proj4string(tempsf) = "+proj=longlat +ellps=WGS84 +no_defs"
+ return(max(spDists(tempsf)))
+ }
>
> system.time(out2 <- tapply(1:nrow(data), data$group, auxF))
user system elapsed
4.71 0.00 4.76
> out2
1 2 3 4 5 6 7 8 9 10
19646.04 19217.48 19223.27 19543.99 19318.55 18856.65 19334.11 19679.45 18840.90 19460.14
对于700万条记录,您可以假设欧几里德距离或将您的点投影到平面,这样您就可以使用欧几里德距离,因为我们知道每组的凸包的点之间的最大距离减少了操作,并且不需要大量的RAM:
> system.time(out3 <- tapply(1:nrow(data), data$group, function(x) max(distm(as.matrix(data[x,.(longitude,latitude)], fun=distHaversine)))))
user system elapsed
13.24 0.01 13.30
> out3
1 2 3 4 5 6 7 8 9 10
19644749 19216989 19223012 19542956 19317958 18856273 19333424 19677917 18840641 19459353
大数据:
> system.time(out4 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2][chull(data[x, 1:2]), ]))))
user system elapsed
0.03 0.00 0.03
> out4
1 2 3 4 5 6 7 8 9 10
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123
答案 1 :(得分:2)
感谢Juan Antonio使用tapply的想法。 。 。我最终使用了你构建的sp函数,它是最快的。
auxF <- function(x) {
require(sp)
tempsf <- data[x, 1:2]
coordinates(tempsf) <- c("longitude", "latitude")
proj4string(tempsf) = "+proj=longlat +ellps=WGS84 +no_defs"
return(max(spDists(tempsf)))
}
out1 <- tapply(1:nrow(data), data$group, auxF)
这也有效: dt.haversine @ SymbolixAU(像往常一样棒)built:
dt.haversine <- function(lat_from, lon_from, lat_to, lon_to, r = 6378137){
radians <- pi/180
lat_to <- lat_to * radians
lat_from <- lat_from * radians
lon_to <- lon_to * radians
lon_from <- lon_from * radians
dLat <- (lat_to - lat_from)
dLon <- (lon_to - lon_from)
a <- (sin(dLat/2)^2) + (cos(lat_from) * cos(lat_to)) * (sin(dLon/2)^2)
return(2 * atan2(sqrt(a), sqrt(1 - a)) * r)
}
library(geosphere)
out1 <- tapply(1:nrow(data), data$group, function(x) max(distm(as.matrix(data[x,c("longitude","latitude")], fun=dt.haversine))))
答案 2 :(得分:0)
这是使用data.table
和.SD
的另一种方式
> library(data.table)
> data <- data.table(
+ latitude=sample(seq(0,90,by=0.001), 10000, replace = TRUE),
+ longitude=sample(seq(0,180,by=0.001), 10000, replace = TRUE)
+ )
> groupn <- nrow(data)/1000
> data$group <- sample(seq(1,groupn,by=1),10000,replace=T)
>
> way1 <- function() {
+ data[,
+ .(maxdist = max(
+ dist(
+ .SD[1:.N, .(latitude, longitude)]
+ )
+ )),
+ keyby = group
+ ]
+ }
>
> way2 <- function() {
+ tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2])))
+ }
>
> system.time(out1 <- way1())
user system elapsed
0.16 0.03 0.18
> out1
group maxdist
1: 1 196.7296
2: 2 195.9555
3: 3 196.0794
4: 4 196.3476
5: 5 195.2577
6: 6 196.0791
7: 7 198.5209
8: 8 196.6944
9: 9 195.2630
10: 10 194.4611
>
> system.time(out1 <- way2())
user system elapsed
0.22 0.10 0.60
> out1
1 2 3 4 5 6 7 8 9 10
196.7296 195.9555 196.0794 196.3476 195.2577 196.0791 198.5209 196.6944 195.2630 194.4611
>
> library(microbenchmark)
> microbenchmark(way1(), way2())
Unit: milliseconds
expr min lq mean median uq max neval cld
way1() 172.3232 231.3411 327.1674 266.9135 370.9586 1569.7742 100 a
way2() 181.7716 228.1266 346.2764 285.8394 444.8963 800.4725 100 a