找到最近的匹配点

时间:2018-05-16 19:29:41

标签: r

我想要做的是红点找到abline另一侧最近的等效蓝点(即1,5找到5,1)。

enter image description here

数据:

https://1drv.ms/f/s!Asb7WztvacfOuesIq4evh0jjvejZ4Q

修改:打开数据执行readRDS("path/to/data")

所以我试过的是找到x和y坐标之间的差异,对它们进行排名,然后找到x和y的排名最小值。结果还不错。我正在努力寻找的方法是找到最接近的元组匹配。

我的尝试:

find_nearest <- function(query, subject){

  weight_df <- data.frame(ID=query$ID)
  #find difference of first, then second, rank and find match in both going from top to bottom
  tmp_df <- query

  for(i in 1:nrow(subject)){
    first_order <- order(abs(query$mean_score_n-subject$mean_score_n[i]))
    second_order <- order(abs(query$mean_score_p-subject$mean_score_p[i]))

    tmp_df$order_1[first_order] <- seq(1, nrow(tmp_df))
    tmp_df$order_2[second_order] <- seq(1, nrow(tmp_df))

    weight_df[,i+1] <- tmp_df$order_1 + tmp_df$order_2

  }
  rownames(weight_df) <- weight_df$ID
  weight_df$ID <- NULL

  print(dim(weight_df))

  nearest_match <- list()
  count <- 1
  subject_ids <- NA
  query_ids <- NA
  while(ncol(weight_df) > 0 & count <= ncol(weight_df)){
    pos <- which(weight_df == min(weight_df, na.rm = TRUE), arr.ind = TRUE)
    if(length(unique(rownames(pos))) > 1){
      for(i in nrow(pos)){
        #if subject/query already used then mask and find another
        if(subject$ID[pos[i,2]] %in% subject_ids){
          weight_df[pos[i,1],pos[i,2]] <- NA
        }else if(query$ID[pos[i,1]] %in% query_ids){
          weight_df[pos[i,1],pos[i,2]] <- NA 
        }else{
          subject_ids <- c(subject_ids, subject$ID[pos[i,2]])
          query_ids <- c(query_ids, query$ID[pos[i,1]])
          nearest_match[[count]] <- data.frame(query=query[pos[i,1],]$ID, subject=subject[pos[i,2],]$ID)
          #mask
          weight_df[pos[i,1],pos[i,2]] <- NA

          count <- count + 1
        }
      }
    }else if(nrow(pos) > 1){
      #if subject/query already used then mask and find another
      if(subject$ID[pos[1,2]] %in% subject_ids){
        weight_df[pos[1,1],pos[1,2]] <- NA
      }else if(query$ID[pos[1,1]] %in% query_ids){
        weight_df[pos[1,1],pos[1,2]] <- NA 
      }else{
        subject_ids <- c(subject_ids, subject$ID[pos[1,1]])
        query_ids <- c(query_ids, query$ID[pos[1,1]])
        nearest_match[[count]] <- data.frame(query=query[pos[1,1],]$ID, subject=subject[pos[1,2],]$ID)
        #mask
        weight_df[pos[1,1],pos[1,2]] <- NA

        count <- count + 1
      }
    }else{
      #if subject/query already used then mask and find another
      if(subject$ID[pos[2]] %in% subject_ids){
        weight_df[pos[1],pos[2]] <- NA
      }else if(query$ID[pos[1]] %in% query_ids){
        weight_df[pos[1],pos[2]] <- NA 
      }else{
        subject_ids <- c(subject_ids, subject$ID[pos[2]])
        query_ids <- c(query_ids, query$ID[pos[1]])
        nearest_match[[count]] <- data.frame(query=query[pos[1],]$ID, subject=subject[pos[2],]$ID)
        #mask
        weight_df[pos[1],pos[2]] <- NA

        count <- count + 1
      }
    }
  }

  out <- plyr::ldply(nearest_match, rbind)

  out <- merge(out, data.frame(subject=subject$ID, 
                                 mean_score_p_n=subject$mean_score_p, 
                                 mean_score_n_n= subject$mean_score_n), by="subject", all.x=TRUE)

  out <- merge(out, data.frame(query=query$ID, 
                                 mean_score_p_p=query$mean_score_p, 
                                 mean_score_n_p= query$mean_score_n), by="query", all.x=TRUE)


  return(out)

}

编辑:这是解决方案的样子吗?

ggplot() +
  geom_point(data=B[out,], aes(x=mean_score_p, y= mean_score_n, color="red")) +
  geom_point(data=A, aes(x=mean_score_p, y=mean_score_n, color="blue")) +
  geom_abline(intercept = 0, slope = 1)

enter image description here

1 个答案:

答案 0 :(得分:3)

query <- readRDS("query.dms")
subject <- readRDS("subject.dms")
kA <- nrow(subject)
kB <- nrow(query)
A <- as.matrix(subject[, 2:3])
B <- as.matrix(query[, 2:3])

我们想要找到最接近的&#34;反向&#34; B中的{(1}}点(行)到A中的每个点。

允许非独特结果的解决方案

然后,假设您使用欧几里德距离,

D <- as.matrix(dist(rbind(A, B[, 2:1])))[(1 + kA):(kA + kB), 1:kA]
unname(apply(D, 2, which.min))
#  [1] 268 183 350 284  21 360 132 287 100 298  58  56 170  70  47 305 353
# [18]  43 266 198  58 215 198 389 412 321 255 181  79 340 292 268 198  54
# [35] 390  38 376  47  19  94 244  18 168 201 160 194 114 247 287 273 182
# [52]  87  94  87 192  63 160 244 101 298  62

B中的相应行号。诀窍是使用B切换B[, 2:1]中点的坐标。

具有独特结果的解决方案

out <- vector("numeric", length = kA)
colnames(D) <- 1:ncol(D)
rownames(D) <- 1:nrow(D)
while(any(out == 0))
  for(i in 1:nrow(D)) {
    aux <- apply(D, 2, which.min)
    if(i %in% aux) {
      win <- which(aux == i)[which.min(D[i, aux == i])]
      out[as.numeric(names(win))] <- as.numeric(rownames(D)[i])
      D <- D[-i, -win, drop = FALSE]
    }
  }
out
#  [1] 268 183 350 284  21 360 132 213 100 298  22  56 170  70 128 305 353
# [18]  43 266 198  58 215 294 389 412 321 255 181  79 340 292  20 347  54
# [35] 390  38 376  47  19  94  73  18 168 201 160 194 114 247 287 273 182
# [52]  87 365 158 192  63 211 244 101  68  62

,而

all(table(res) == 1)
# [1] TRUE

证实了独特性。解决方案效率不高,但在数据集上只需几秒钟。这需要一些时间,因为它会继续检查B中的所有可用点,检查它是否是A中任何点的最近点。如果是,则B中的对应点将分配给A中最近的一个。然后从距离矩阵中消除A中的点和B中的点。循环一直持续到A中的每个点在B中都有一些匹配。