我有一些代码可以生成使用tidyverse函数的数据框中所有点之间的距离矩阵。但是,它的工作速度非常慢。有没有人知道做同样事情的方法更快?
示例数据和工作代码:
library(tidyverse)
locs <- data.frame(ID = 1:4000, x = runif (4000, 0, 1), y = runif (4000, 0, 1))
df1 <- locs %>%
mutate(k = 1)
df2 <- df1 %>%
full_join(df1, by = "k") %>%
mutate(length = sqrt((x.x - x.y)^2 + (y.x - y.y)^2)) %>%
select(ID.x, ID.y, length)
dists <- matrix(data = df2$length, nrow = nrow(df1), ncol = nrow(df1))
答案 0 :(得分:2)
您可以使用基本R函数dist
:
locs <- data.frame(ID = 1:10, x = runif (10, 0, 1), y = runif (10, 0, 1))
dist(locs[,2:3], upper = T, diag = T)
输出:
1 2 3 4 5 6 7 8 9 10
1 0.00000000 1.10309601 0.98790825 0.54490600 0.42478532 1.06323764 0.31094245 0.52593635 0.44695830 0.85010761
2 1.10309601 0.00000000 0.29292865 0.93412638 0.74551902 0.17160290 0.83557056 0.62393711 0.74218236 0.57669081
3 0.98790825 0.29292865 0.00000000 0.69626767 0.72278486 0.13085561 0.78064096 0.46359296 0.73098652 0.72732431
4 0.54490600 0.93412638 0.69626767 0.00000000 0.65426980 0.81617143 0.59851262 0.36551106 0.68253093 1.00018238
5 0.42478532 0.74551902 0.72278486 0.65426980 0.00000000 0.75537605 0.11384534 0.36844164 0.02911855 0.42844270
6 1.06323764 0.17160290 0.13085561 0.81617143 0.75537605 0.00000000 0.82826619 0.55014297 0.75867851 0.68258388
7 0.31094245 0.83557056 0.78064096 0.59851262 0.11384534 0.82826619 0.00000000 0.37224997 0.13688270 0.54088523
8 0.52593635 0.62393711 0.46359296 0.36551106 0.36844164 0.55014297 0.37224997 0.00000000 0.39086196 0.64185453
9 0.44695830 0.74218236 0.73098652 0.68253093 0.02911855 0.75867851 0.13688270 0.39086196 0.00000000 0.40400339
10 0.85010761 0.57669081 0.72732431 1.00018238 0.42844270 0.68258388 0.54088523 0.64185453 0.40400339 0.00000000
基准测试,包含1000条记录:
library(dplyr)
library(microbenchmark)
locs <- data.frame(ID = 1:1000, x = runif (1000, 0, 1), y = runif (1000, 0, 1))
f1 <- function()
{
df1 <- locs %>%
mutate(k = 1)
df2 <- df1 %>%
full_join(df1, by = "k") %>%
mutate(length = sqrt((x.x - x.y)^2 + (y.x - y.y)^2)) %>%
select(ID.x, ID.y, length)
dists <- matrix(data = df2$length, nrow = nrow(df1), ncol = nrow(df1))
}
f2 <- function(){dist(locs[,2:3],upper = T,diag=T)}
microbenchmark(f1())
microbenchmark(f2())
结果:
Unit: milliseconds
expr min lq mean median uq max neval
f1() 81.74188 245.8014 276.4318 259.7682 294.01 567.9409 100
和
Unit: milliseconds
expr min lq mean median uq max neval
f2() 6.956302 7.330661 8.675304 8.11507 8.981121 18.77783 100
希望这有帮助!