Question

我有以下数据集：

id    x         y       age
1  1745353   930284.1    30
2  1745317   930343.4    23
3  1745201   930433.9    10
4  1745351   930309.4    5
5  1745342   930335.2    2
6  1746619   929969.7    66
7  1746465   929827.1    7
8  1746731   928779.5    55
9  1746629   929902.6    26
10 1745938   928923.2    22

我想根据由给定（x，y）坐标计算出的距离，为每个id找到5个最近的邻居。最终输出应如下所示：

id      n_id    dist  age  age_n_id
1        2       2     30    23
1        5       1.5   30    2
1        3       5     30    10
1        7       3     30    7
1        8       3     30    55
2        1       6     23    30
2        10      1     23    22
2        6       2     23    66
2        7       6     23    7
2        8       9     23    55
3        2       1     10    23
3        1       2     10    30
3        4       1.2   10    5
3        6       1.6   10    66
3        9       2.3   10    26
................................
................................
10       2       1.9   22    23
10       6       2.3   22    66
10       9       2.1   22    26
10       1       2.5   22    30
10       5       1.6   22    2

其中n_id是邻居的id，dist是id和n_id之间的直线距离，age是id的年龄，age_n_id是n_id的年龄。此外，最大距离为10公里。如果某个id在10公里内的邻居少于5个（例如只有3个），则该id只重复3次。

我是编程新手，任何帮助我都会非常感激。

Answer 1

data.table解决方案：

library(data.table)
# Example data from the question, parsed from an inline text table.
data <- fread("id    x         y       age
1  1745353   930284.1    30
            2  1745317   930343.4    23
            3  1745201   930433.9    10
            4  1745351   930309.4    5
            5  1745342   930335.2    2
            6  1746619   929969.7    66
            7  1746465   929827.1    7
            8  1746731   928779.5    55
            9  1746629   929902.6    26
            10 1745938   928923.2    22")

# Attach the complete x, y and age vectors to every row as list columns, so
# that each row can later be compared against all the others.
data[, c("all_x", "all_y", "all_age") := list(list(x), list(y), list(age))]

# Row position (1..n); used to leave a row out of its own neighbour list.
data[, seq_nr := .I]

# Straight-line (Euclidean) distance from one point to all other points.
#
# x_1, y_1: coordinates of the reference point.
# x_2, y_2: lists whose first element is the vector of all x / y coordinates.
# z:        position(s) to leave out of those vectors (the reference point).
#
# Returns a numeric vector with one distance per remaining point.
formula_distance <- function(x_1, x_2, y_1, y_2, z) {
  other_x <- x_2[[1]][-z]
  other_y <- y_2[[1]][-z]
  delta_x <- x_1 - other_x
  delta_y <- y_1 - other_y
  sqrt(delta_x^2 + delta_y^2)
}

# Build one row per ordered (id, neighbour) pair.
# Grouping by seq_nr yields one group per input row, so x, y, id and age are
# scalars inside j, while all_x / all_y / all_age carry the full columns.
# The row position (seq_nr) -- not the id value -- is used everywhere to drop
# the row itself, so the result stays correct when ids are not exactly 1..n.
n_rows <- nrow(data)
all_ids <- data$id
data <- data[, list(
  id = rep(id, n_rows - 1L),
  age = rep(age, n_rows - 1L),
  dist = formula_distance(x, all_x, y, all_y, seq_nr),
  n_id = all_ids[-seq_nr],
  n_age = all_age[[1]][-seq_nr]
), by = seq_nr]

# Nearest neighbours first within each source row.
data <- data[order(seq_nr, dist)]

# Filter data within threshold (same unit as the coordinates; if they are
# metres, the 10 km asked for in the question would be 10000):
threshold <- 1000

# How many nearest neighbors to take:
k <- 5
filtered <- data[dist <= threshold]

# Keep at most k rows per source row; rows with fewer than k neighbours inside
# the threshold simply contribute fewer rows.
filtered <- filtered[, head(.SD, k), by = seq_nr]
filtered[, seq_nr := NULL]

filtered
    id age      dist n_id n_age
 1:  1  30  25.37893    4     5
 2:  1  30  52.27055    5     2
 3:  1  30  69.37211    2    23
 4:  1  30 213.41050    3    10
 5:  2  23  26.31045    5     2
 6:  2  23  48.08326    4     5
 7:  2  23  69.37211    1    30
 8:  2  23 147.12665    3    10
 9:  3  10 147.12665    2    23
10:  3  10 172.11243    5     2
11:  3  10 194.93653    4     5
12:  3  10 213.41050    1    30
13:  4   5  25.37893    1    30
14:  4   5  27.32471    5     2
15:  4   5  48.08326    2    23
16:  4   5 194.93653    3    10
17:  5   2  26.31045    2    23
18:  5   2  27.32471    4     5
19:  5   2  52.27055    1    30
20:  5   2 172.11243    3    10
21:  6  66  67.84106    9    26
22:  6  66 209.88273    7     7
23:  7   7 180.54432    9    26
24:  7   7 209.88273    6    66
25:  8  55 805.91482   10    22
26:  9  26  67.84106    6    66
27:  9  26 180.54432    7     7
28: 10  22 805.91482    8    55

Answer 2

假设坐标单位为米。

# Load packages
library(FNN)
library(tidyverse)
library(data.table)

# Create example data frame
dataset <- fread("id    x         y       age
1  1745353   930284.1    30
2  1745317   930343.4    23
3  1745201   930433.9    10
4  1745351   930309.4    5
5  1745342   930335.2    2
6  1746619   929969.7    66
7  1746465   929827.1    7
8  1746731   928779.5    55
9  1746629   929902.6    26
10 1745938   928923.2    22")

# Number of neighbours to keep and the maximum distance
# (same unit as the coordinates, assumed to be meters)
k <- 5
max_dist <- 10000

# Calculate the nearest ID and distance
# k is capped because a point has at most nrow(dataset) - 1 neighbours
near_data <- get.knn(dataset[, c("x", "y")], k = min(k, nrow(dataset) - 1))

# Extract the nearest ID
# (get.knn returns row positions in dataset, not id values)
nn_index <- as.data.frame(near_data$nn.index)

# Extract the nearest Distance
nn_dist <- as.data.frame(near_data$nn.dist)

# Re organize the data
nn_index2 <- nn_index %>%
  # Add ID column, taken from the data instead of a hard-coded 1:10
  mutate(ID = dataset$id) %>%
  # Transform the data frame to one row per (ID, Rank)
  pivot_longer(cols = -ID, names_to = "Rank", values_to = "row_nr") %>%
  # Translate the neighbour's row position into its id
  mutate(n_id = dataset$id[row_nr]) %>%
  select(-row_nr)

nn_dist2 <- nn_dist %>%
  # Add ID column
  mutate(ID = dataset$id) %>%
  # Transform the data frame to one row per (ID, Rank)
  pivot_longer(cols = -ID, names_to = "Rank", values_to = "dist")

# Remove coordinates in dataset
dataset2 <- dataset %>% select(-x, -y)

# Create the final output
nn_final <- nn_index2 %>%
  # Merge nn_index2 and nn_dist2
  left_join(nn_dist2, by = c("ID", "Rank")) %>%
  # Merge with dataset2 by ID and id
  left_join(dataset2, by = c("ID" = "id")) %>%
  # Merge with dataset2 by n_id and id
  left_join(dataset2, by = c("n_id" = "id")) %>%
  # Remove Rank
  select(-Rank) %>%
  # Rename column names
  rename(id = ID, age = age.x, age_n_id = age.y) %>%
  # Sort the data frame
  arrange(id, dist) %>%
  # Filter the dist < 10000 meters
  filter(dist < max_dist) %>%
  # Return a plain data frame, as before
  as.data.frame()

基于r中给定距离的5个最近邻居

2 个答案: