我有以下数据集:
id x y age
1 1745353 930284.1 30
2 1745317 930343.4 23
3 1745201 930433.9 10
4 1745351 930309.4 5
5 1745342 930335.2 2
6 1746619 929969.7 66
7 1746465 929827.1 7
8 1746731 928779.5 55
9 1746629 929902.6 26
10 1745938 928923.2 22
我想基于从给定(x,y)计算的距离为每个id找到5个最接近的邻居。最终输出应如下所示:
id n_id dist age age_n_id
1 2 2 30 23
1 5 1.5 30 2
1 3 5 30 10
1 7 3 30 7
1 8 3 30 55
2 1 6 23 30
2 10 1 23 22
2 6 2 23 66
2 7 6 23 7
2 8 9 23 55
3 2 1 10 23
3 1 2 10 30
3 4 1.2 10 5
3 6 1.6 10 66
3 9 2.3 10 26
................................
................................
10 2 1.9 22 23
10 6 2.3 22 66
10 9 2.1 22 26
10 1 2.5 22 30
10 5 1.6 22 2
其中n_id是id,如果邻居,dist是id和n_id之间的直线距离,age是id的年龄,age_n_id是n_id的年龄。此外,最大距离为10公里。如果在10km内有少于5个邻居,比如3个邻居,则相应的id将仅重复三次。
我在编程方面比较新,任何帮助都会非常感激。
答案 0 :(得分:2)
data.table解决方案:
library(data.table)
data<-fread("id x y age
1 1745353 930284.1 30
2 1745317 930343.4 23
3 1745201 930433.9 10
4 1745351 930309.4 5
5 1745342 930335.2 2
6 1746619 929969.7 66
7 1746465 929827.1 7
8 1746731 928779.5 55
9 1746629 929902.6 26
10 1745938 928923.2 22")
data[,all_x:=list(list(x))]
data[,all_y:=list(list(y))]
data[,all_age:=list(list(age))]
data[,seq_nr:=seq_len(.N)]
#Distance formula:
formula_distance<-function(x_1,x_2,y_1,y_2,z){
x_2<-x_2[[1]][-z]
y_2<-y_2[[1]][-z]
sqrt((x_1-x_2)^2+(y_1-y_2)^2)
}
data<-data[,{list(dist = formula_distance(x,all_x,y,all_y,seq_nr),
id =seq(1:nrow(data))[-id],
age_id=all_age[[1]][-id],
age=rep(age,nrow(data)-1))},by=1:nrow(data)]
data<-data[order(nrow,dist)]
#Filter data within threshold:
threshold<-1000
#How many nearest neighbors to take:
k<-5
filtered<-data[dist<=threshold]
filtered<-filtered[,{list(dist=dist[1:k],n_id=id[1:k],n_age=age_id[1:k])},by=c("nrow","age")]
filtered<-filtered[!is.na(dist)]
setnames(filtered,"nrow","id")
filtered
id age dist n_id n_age
1: 1 30 25.37893 4 5
2: 1 30 52.27055 5 2
3: 1 30 69.37211 2 23
4: 1 30 213.41050 3 10
5: 2 23 26.31045 5 2
6: 2 23 48.08326 4 5
7: 2 23 69.37211 1 30
8: 2 23 147.12665 3 10
9: 3 10 147.12665 2 23
10: 3 10 172.11243 5 2
11: 3 10 194.93653 4 5
12: 3 10 213.41050 1 30
13: 4 5 25.37893 1 30
14: 4 5 27.32471 5 2
15: 4 5 48.08326 2 23
16: 4 5 194.93653 3 10
17: 5 2 26.31045 2 23
18: 5 2 27.32471 4 5
19: 5 2 52.27055 1 30
20: 5 2 172.11243 3 10
21: 6 66 67.84106 9 26
22: 6 66 209.88273 7 7
23: 7 7 180.54432 9 26
24: 7 7 209.88273 6 66
25: 8 55 805.91482 10 22
26: 9 26 67.84106 6 66
27: 9 26 180.54432 7 7
28: 10 22 805.91482 8 55
答案 1 :(得分:1)
假设坐标单位为米。
# Load packages
library(FNN)
library(tidyverse)
library(data.table)
# Create example data frame
dataset <- fread("id x y age
1 1745353 930284.1 30
2 1745317 930343.4 23
3 1745201 930433.9 10
4 1745351 930309.4 5
5 1745342 930335.2 2
6 1746619 929969.7 66
7 1746465 929827.1 7
8 1746731 928779.5 55
9 1746629 929902.6 26
10 1745938 928923.2 22")
# Calculate the nearest ID and distance
near_data <- get.knn(dataset[, 2:3], k = 5)
# Extract the nearest ID
nn_index <- as.data.frame(near_data$nn.index)
# Extract the nearest Distance
nn_dist <- as.data.frame(near_data$nn.dist)
# Re organize the data
nn_index2 <- nn_index %>%
# Add ID column
mutate(ID = 1:10) %>%
# Transform the data frame
gather(Rank, n_id, -ID)
nn_dist2 <- nn_dist %>%
# Add ID column
mutate(ID = 1:10) %>%
# Transform the data frame
gather(Rank, dist, -ID)
# Remove coordinates in dataset
dataset2 <- dataset %>% select(-x, -y)
# Create the final output
nn_final <- nn_index2 %>%
# Merge nn_index2 and nn_dist2
left_join(nn_dist2, by = c("ID", "Rank")) %>%
# Merge with dataset2 by ID and id
left_join(dataset2, by = c("ID" = "id")) %>%
# Merge with dataset2 by n_id and id
left_join(dataset2, by = c("n_id" = "id")) %>%
# Remove Rank
select(-Rank) %>%
# Rename column names
rename(id = ID, age = age.x, age_n_id = age.y) %>%
# Sort the data frame
arrange(id, dist) %>%
# Filter the dist < 10000 meters
filter(dist < 10000)