从带有标签的N个向量快速计算共生矩阵

时间:2015-12-17 19:56:06

标签: r

我有一个矩阵,在其N行的每一行(聚类算法的迭代)中包含每个M点(列)所属的聚类:

例如:

CREATE TRIGGER trg_no_future_births
  BEFORE INSERT ON customer
  FOR EACH ROW
BEGIN
  IF( :new.dateOfBirth > sysdate )
  THEN
    RAISE_APPLICATION_ERROR( -20001, 'Date of birth cannot be in the future.' );
  END IF;
END;

我想建立一个共生矩阵,其中位置(i,j)是通过不同行在同一个群集中看到两个点的次数之和。

一种天真的方法是:

data <- t(rmultinom(50, size = 7, prob = rep(0.1,10)))

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
 [1,]    0    0    0    2    1    1    0    2    1     0
 [2,]    3    1    2    0    0    0    0    1    0     0
 [3,]    0    1    2    1    0    0    0    0    2     1
 [4,]    0    1    1    0    2    0    0    2    0     1
 [5,]    3    0    0    0    2    1    0    0    0     1
 [6,]    0    1    2    0    0    1    1    2    0     0
 [7,]    0    1    0    1    0    1    1    2    1     0
 [8,]    3    0    0    2    0    0    0    1    0     1
 ...

我怎样才能让它更快?

额外:如何使用 coincidences <- matrix(0, nrow=10, ncol=10) for (n in 1:50){ for (m in 1:10){ coincidences[m,] <- coincidences[m,] + as.numeric(data[n,m] == data[n,]) } } [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [1,] 50 17 21 22 15 14 16 20 18 18 [2,] 17 50 17 14 17 18 15 14 20 16 [3,] 21 17 50 20 21 16 16 13 16 20 [4,] 22 14 20 50 16 18 16 21 18 14 [5,] 15 17 21 16 50 18 16 17 11 17 [6,] 14 18 16 18 18 50 18 22 25 13 [7,] 16 15 16 16 16 18 50 14 20 22 [8,] 20 14 13 21 17 22 14 50 11 15 [9,] 18 20 16 18 11 25 20 11 50 18 [10,] 18 16 20 14 17 13 22 15 18 50 绘制它? (我在ggplot2看过heatmap.2,但我不知道这是否有点过分了)

2 个答案:

答案 0 :(得分:4)

RCPP

使用Rcpp软件包在R中使用C ++实现可以尽可能快地完成工作

library(Rcpp)

data <- t(rmultinom(50, size = 7, prob = rep(0.1,10)))
    coincidences <- matrix(0, nrow=10, ncol=10)

#R implementation
fR<-function(data,coincidences){
for (n in 1:50){ 
    for (m in 1:10){

            coincidences[m,] <- coincidences[m,] + as.numeric(data[n,m] == data[n,])

    }

}
    return(coincidences)
}


#C++ Implementation
cppFunction('NumericMatrix fC(NumericMatrix data, NumericMatrix coincidences ) {

    int nrow = data.nrow(), ncol = coincidences.ncol();
    NumericMatrix out(nrow, ncol);
    int addon;


  for (int n = 0; n < nrow; n++) {
     for (int m = 0; m < ncol; m++) {
       for (int p = 0; p < nrow; p++) {

            if( data(n,m) == data(n,p) ){
                addon = 1;
            }else {
                addon = 0;
            }

            coincidences(m,p) = coincidences(m,p) + addon;


        }

    }

  }
  return coincidences;
}')

 #Call functions
 coincidences <- matrix(0, nrow=10, ncol=10)
 c1<-fC(data,coincidences)
 coincidences <- matrix(0, nrow=10, ncol=10)
 c2<-fR(data,coincidences)
 all.equal(c1,c2)
 > TRUE


 library(microbenchmark)
 microbenchmark(fC(data,coincidences),fR(data,coincidences))

>    Unit: microseconds
                       expr     min      lq      mean  median      uq     max neval
     fC(data, coincidences)   6.415   6.736   8.88454   7.698   8.660  74.727   100
     fR(data, coincidences) 283.514 290.089 301.84637 293.456 309.973 388.388   100

修改

要绘制:

library(reshape2)
C<-fC(data,coincidences)
ggplot(melt(C), aes(Var1,Var2, fill=value)) + geom_raster()

答案 1 :(得分:1)

使用矢量化和colSums的更快方式:

> set.seed(1)
> data <- t(rmultinom(10000, size = 7, prob = rep(0.1,100)))
>
> system.time({
+  coincidences <- matrix(0, nrow=100, ncol=100)
+  for (n in 1:10000){
+    for (m in 1:100){
+      coincidences[m,] <- coincidences[m,] + as.numeric(data[n,m] == data[n,])
+      }
+    }}
+ )
   user  system elapsed
  9.692   0.000   9.708
>
> system.time(coincidences2<-sapply(1:ncol(data), function(i){ colSums(data[,i]==data) }))
   user  system elapsed
  0.676   0.096   0.774
>
> all.equal(coincidences2,coincidences)
[1] TRUE