如何有效地找到附近的整数?

时间:2013-01-24 06:04:47

标签: r matrix indexing

我有一个2列矩阵,具有唯一的整数坐标和分数:

> data1<-data.matrix(data.frame("coord"=sample(1:100,50),"scores"=rnorm(25)))
> data1
      coord      scores
 [1,]    22 -0.73799827
 [2,]    76 -0.78022310
 [3,]    62  0.45633095
 [4,]    77  0.56617413
 [5,]    60 -0.94876368
 [6,]    83 -1.20792643
 [7,]    85 -1.13890957
 [8,]    78  0.63959763
 [9,]    28  0.28039908
[10,]    68 -1.04277456
[11,]    27  0.48755194
[12,]    66  0.09612861
[13,]    69 -1.60932063
[14,]     6 -0.66797103
[15,]    10 -0.56594989
[16,]    50 -0.79548555
[17,]    39  1.13064066
[18,]    75  0.21617203
[19,]    34 -0.13480437
[20,]    54 -1.64825097
[21,]    48 -0.97955118
[22,]    58  0.55307028
[23,]    11 -0.99319227
[24,]    42 -0.58430293
[25,]    37  1.76576096
[26,]    67 -0.73799827
[27,]    65 -0.78022310
[28,]    47  0.45633095
[29,]    72  0.56617413
[30,]    97 -0.94876368
[31,]    57 -1.20792643
[32,]    38 -1.13890957
[33,]    16  0.63959763
[34,]    15  0.28039908
[35,]    86 -1.04277456
[36,]    33  0.48755194
[37,]    80  0.09612861
[38,]     2 -1.60932063
[39,]    93 -0.66797103
[40,]    73 -0.56594989
[41,]    40 -0.79548555
[42,]    26  1.13064066
[43,]    13  0.21617203
[44,]    96 -0.13480437
[45,]    41 -1.64825097
[46,]    59 -0.97955118
[47,]    46  0.55307028
[48,]    43 -0.99319227
[49,]    94 -0.58430293
[50,]    21  1.76576096

和唯一坐标向量:

> centers
 [1]  39  31  61  16  48  82  42  76  71  43  93  35   6 100  67  81  70  79  45  17  96  78  69  95  29

我想创建一个矩阵,用于映射data1相对于中心的分数,其中每个中心是矩阵的中间,每行一个中心。换句话说,在矩阵中,我想看到每个“中心”附近有坐标的分数。我采取了以下方法:

> score_matrix<-matrix(nrow=length(centers),ncol=10)
> for(i in 1:length(centers)){
+ data2 <- data1
+ data2[,1] <- data2[,1] - centers[i] + ncol(score_matrix)/2
+ region_scores <- subset(data2,data2[,1] > 0 & data2[,1] <= ncol(score_matrix))
+ score_matrix[i,region_scores[,1]]<-region_scores[,2]
+ }
> print(score_matrix)
            [,1]       [,2]       [,3]       [,4]       [,5]       [,6]       [,7]       [,8]       [,9]      [,10]
 [1,] -0.8688788  0.4524561  1.4594981 -1.0552725 -0.1594024         NA -0.4122056         NA         NA         NA
 [2,] -1.0552725  1.5064965         NA -1.8956159         NA         NA         NA  0.7000265         NA         NA
 [3,]         NA         NA         NA -0.7334736         NA         NA -1.8381591 -1.8381591 -0.7334736         NA
 [4,]         NA         NA -0.3910595  1.5064965         NA -0.1006090  0.1064373  0.4524561         NA         NA
 [5,]         NA         NA  0.8967748         NA         NA         NA         NA  0.8458699 -0.1006090         NA
 [6,]         NA         NA -1.8381591 -1.8381591 -0.7334736         NA         NA         NA         NA         NA
 [7,] -1.3803871 -1.5606603         NA  0.8967748 -0.7036330         NA         NA         NA         NA -1.6780760
 [8,]         NA         NA         NA         NA  0.8458699 -0.1006090         NA -1.5606603         NA         NA
 [9,]         NA         NA         NA -1.3673480  1.8448811  1.1304699         NA -0.8317189  0.1064373 -1.4426410
[10,]  0.8967748 -0.7036330         NA         NA         NA         NA -1.6780760 -0.3910595         NA         NA
[11,]  1.1304699         NA         NA -1.0552725  1.5064965         NA -1.8956159         NA         NA         NA
[12,]         NA         NA -1.6780760 -0.7036330         NA         NA  0.8967748         NA         NA         NA
[13,]         NA         NA         NA -1.6780760 -0.7036330         NA         NA  0.8967748         NA         NA
[14,]         NA  0.8458699 -0.1006090         NA -1.5606603         NA         NA  0.8458699 -0.1594024         NA
[15,] -0.1006090         NA -1.5606603         NA         NA  0.8458699 -0.1594024         NA         NA -0.3910595
[16,]  1.8448811  1.1304699         NA -0.8317189  0.1064373 -1.4426410         NA  1.8448811         NA -1.4426410
[17,]         NA         NA -1.0552725  1.5064965         NA -1.8956159         NA         NA         NA  0.7000265
[18,]         NA         NA         NA         NA         NA -1.3673480  1.8448811  1.1304699         NA -0.8317189
[19,]         NA         NA         NA         NA -0.7334736         NA         NA -1.8381591 -1.8381591 -0.7334736
[20,]         NA         NA  0.7000265         NA         NA         NA -0.8688788  0.4524561  1.4594981 -1.0552725
[21,]         NA -1.3803871 -1.5606603         NA  0.8967748 -0.7036330         NA         NA         NA         NA
[22,] -0.7334736         NA         NA         NA         NA         NA -1.3673480  1.8448811  1.1304699         NA
[23,]         NA         NA         NA         NA -1.3673480  1.8448811  1.1304699         NA -0.8317189  0.1064373
[24,]         NA  1.4594981  0.7000265         NA -1.3673480 -0.8688788  1.1304699         NA         NA -1.0552725
[25,] -0.8317189  0.1064373 -1.4426410         NA  1.8448811         NA -1.4426410 -1.8956159         NA  1.4594981

但是,我应用它的数据集非常大,脚本需要大约24小时才能完成。有没有办法更有效地完成同样的事情?

谢谢,

2 个答案:

答案 0 :(得分:2)

我在Rcpp中实现你的功能:

data1 <-data.matrix(data.frame("coord"=sample(1:100,50),"scores"=rnorm(25)))
centers <- unique(data1[,1])
score_matrix<-matrix(nrow=length(centers),ncol=10)
for(i in 1:length(centers)){
 data2 <- data1
 data2[,1] <- data2[,1] - centers[i] + ncol(score_matrix)/2
 region_scores <- subset(data2,data2[,1] > 0 & data2[,1] <= ncol(score_matrix))
 score_matrix[i,region_scores[,1]]<-region_scores[,2]
 }

library(inline)
library(Rcpp)

src <- '
  NumericMatrix data1(Rdata1);
  NumericVector centers(Rcenters);
  NumericMatrix score_matrix(Rscore_matrix);
  NumericVector data2(data1.nrow());
  for(int i = 0;i < centers.size();i++) {
    data2 = data1.column(0);
    data2 = data2 - centers(i) + score_matrix.ncol() / 2;
    for(int j = 0, k = 0;j < data2.size();j++) { // subset part
      if (data2(j) <= 0)
        continue;
      if (data2(j) > score_matrix.ncol())
        continue;
      score_matrix(i, data2(j) - 1) = data1(j,1);
    }
  }
  return score_matrix;
'

f <- cxxfunction(sig=c(Rdata1 = "numeric", Rcenters = "numeric", Rscore_matrix = "numeric"), 
  plugin="Rcpp", body=src)

score_matrix2<-matrix(nrow=length(centers),ncol=10)
score_matrix2 <- f(data1, centers, score_matrix2)
all.equal(score_matrix, score_matrix2)

library(rbenchmark) 

benchmark({
  score_matrix<-matrix(nrow=length(centers),ncol=10)
  for(i in 1:length(centers)){
   data2 <- data1
   data2[,1] <- data2[,1] - centers[i] + ncol(score_matrix)/2
   region_scores <- subset(data2,data2[,1] > 0 & data2[,1] <= ncol(score_matrix))
   score_matrix[i,region_scores[,1]]<-region_scores[,2]
   }
}, {
  score_matrix2<-matrix(nrow=length(centers),ncol=10)
  score_matrix2 <- f(data1, centers, score_matrix2)
})

在上次测试中,Rcpp比我的机器上的R快一倍。


如果速度不够快,您可能希望并行化算法。

尝试R包snow并重新设计您的算法以获得除法数据和合并结果。

答案 1 :(得分:1)

R重新实施提供了47倍的加速。这是我原始代码的实现

f0 <- function(data1, centers) {
    score_matrix <- matrix(nrow=length(centers), ncol=10)
    for(i in seq_along(centers)) {
        data2 <- data1
        data2[,1] <- data2[,1] - centers[i] + ncol(score_matrix) / 2
        idx <- data2[,1] > 0 & data2[,1] <= ncol(score_matrix)
        region_scores <- data2[idx,]
        score_matrix[i,region_scores[,1]] <- region_scores[,2]
    }
    score_matrix
}

我提取了常见的计算(centers[i] - ncol(score_matrix) / 2和子集data1)来获取

f1 <- function(data1, centers, ncol=10) {
    score_matrix <- matrix(NA_real_, length(centers), ncol)
    ccenters <- centers - ncol / 2
    d1 <- data1[,1]
    d2 <- data1[,2]
    for (i in seq_along(ccenters)) {
        score <- d1 - ccenters[i]
        idx <- score > 0 & score <= ncol
        score_matrix[i, score[idx]] <- d2[idx]
    }
    score_matrix
}

for循环应该编译好

library(compiler)
f1c <- cmpfun(f1)

library(rbenchmark)
data1 <- data.frame(coord=sample(100,50), scores=rnorm(25))
centers <- sort(scan(textConnection("39 31 61 16 48 82 42 76 71 43 93 35 6
                                     100 67 81 70 79 45 17 96 78 69 95 29")))

给出

> identical(f0(data1, centers), f1(data1, centers))
[1] TRUE
> identical(f0(data1, centers), f1c(data1, centers))
[1] TRUE
> benchmark(f0(data1, centers), f1(data1, centers), f1c(data1, centers),
+           replications=10, columns=c("test", "elapsed", "relative"))
                 test elapsed relative
1  f0(data1, centers)   0.139   46.333
3 f1c(data1, centers)   0.003    1.000
2  f1(data1, centers)   0.005    1.667

原始问题中的目标似乎有些不完整 - 你打算用10,000 x 10矩阵做什么?