Question

我有一个包含41行和6列的矩阵。这就是第一部分的样子。

      X13  X15  X17  X19  X21  X23 
 [1,] "7"  "6"  "5"  "8"  "1"  "8" 
 [2,] "7"  "6"  "5"  "8"  "14" "3" 
 [3,] "7"  "6"  "1"  "3"  "12" "3" 
 [4,] "7"  "6"  "1"  "5"  "6"  "14"
 [5,] "2"  "6"  "1"  "5"  "16" "3" 
 [6,] "2"  "3"  "5"  "5"  "2"  "3" 
 [7,] "7"  "5"  "5"  "17" "7"  "3" 
 [8,] "7"  "2"  "5"  "2"  "2"  "14"
 [9,] "2"  "2"  "10" "10" "2"  "3" 
[10,] "2"  "2"  "10" "5"  "2"  "6"

我的目标是，将所有列彼此进行比较，并查看2列中有多少个数字相等。我试着这样做：

s <- sum(matrix[,1]==matrix[,2])

但是因为我需要比较所有可能的对，所以它没有效果。把它放在一个循环中会很好，但我不知道如何。

我想以6x6相似度矩阵的形式得到我的结果。像这样：

      X13 X15 X17 X19 X21 X23
 X13   0   0   3   2   2   3
 X15   0   0   9  11   4   6
 X17   3   9   0   5   1   3
 X19   2  11   5   0   9  10
 X21   2   4   1   9   0   9
 X23   3   6   3  10   9   0

如您所见，我希望在将列与iteslf进行比较时将零填充到矩阵中。

由于我是初学R用户，这个任务对我来说真的很复杂。我需要将这个比较用于50个矩阵，所以如果你能帮助我，我会很高兴。我将不胜感激任何提示/建议。我的英语也不太好，但我希望我能够很好地解释我的问题。：）

Answer 1

这是一个完全矢量化的解决方案，使用expand.grid来计算索引，colSums和matrix来包装结果。

#  Some reproducible 6x6 sample data
set.seed(1)
m <- matrix( sample(10,36,repl=TRUE) , ncol = 6 )
#     [,1] [,2] [,3] [,4] [,5] [,6]
#[1,]    3   10    7    4    3    5
#[2,]    4    7    4    8    4    6
#[3,]    6    7    8   10    1    5
#[4,]   10    1    5    3    4    2
#[5,]    3    3    8    7    9    9
#[6,]    9    2   10    2    4    7


#  Vector source for column combinations
n <- seq_len( ncol(m) )

#  Make combinations
id <- expand.grid( n , n )

#  Get result
out <- matrix( colSums( m[ , id[,1] ] == m[ , id[,2] ] ) , ncol = length(n) )
diag(out) <- 0
#    [,1] [,2] [,3] [,4] [,5] [,6]
#[1,]    0    1    1    0    2    0
#[2,]    1    0    0    1    0    0
#[3,]    1    0    0    0    1    0
#[4,]    0    1    0    0    0    0
#[5,]    2    0    1    0    0    1
#[6,]    0    0    0    0    1    0

Answer 2

一种非矢量化（但可能更有内存效率）的方法：

# Fancy way.
similarity.matrix<-apply(matrix,2,function(x)colSums(x==matrix))
diag(similarity.matrix)<-0


# More understandable. But verbose.
similarity.matrix<-matrix(nrow=ncol(matrix),ncol=ncol(matrix))
for(col in 1:ncol(matrix)){
  matches<-matrix[,col]==matrix
  match.counts<-colSums(matches)
  match.counts[col]<-0 # Set the same column comparison to zero.
  similarity.matrix[,col]<-match.counts
}

Answer 3

使用qdap包中的v_outer的方法：

library(qdapTools) #Using Simon's data

x <- v_outer(m, function(x, y) sum(x==y))
diag(x) <- 0

##    V1 V2 V3 V4 V5 V6
## V1  0  1  1  0  2  0
## V2  1  0  0  1  0  0
## V3  1  0  0  0  1  0
## V4  0  1  0  0  0  0
## V5  2  0  1  0  0  1
## V6  0  0  0  0  1  0

编辑我添加了基准：

set.seed(1)
matrix <- m <- matrix( sample(10,36,repl=TRUE) , ncol = 6 )

MATRIX <- function(){
    n <- seq_len( ncol(m) )
    id <- expand.grid( n , n )
    out <- matrix( colSums( m[ , id[,1] ] == m[ , id[,2] ] ) , ncol = length(n) )
    diag(out) <- 0
    out
}

V_OUTER <- function(){
    x <- v_outer(m, function(x, y) sum(x==y))
    diag(x) <- 0
    x
}

APPLY <- function(){
    similarity.matrix<-apply(matrix,2,function(x)colSums(x==matrix))
    diag(similarity.matrix)<-0
    similarity.matrix
}

library(microbenchmark)
(op <- microbenchmark( 
    MATRIX(),
    V_OUTER(),
    APPLY() ,  
times=1000L))

Unit: microseconds
      expr     min      lq  median      uq      max neval
  MATRIX() 243.980 264.972 277.101 286.898 1719.519  1000
 V_OUTER() 203.861 223.921 234.650 243.280 1579.570  1000
   APPLY()  96.566 108.228 112.893 118.025 1470.409  1000

R：在矩阵中成对比较所有列

3 个答案: