我试图将每一行与矩阵中的所有其他行进行比较,以计算每行与所有其他行的差异数。然后将结果存储在矩阵的左下角三角形中。
因此,例如当行 m[1,] 与行 m[2,] 和 m[3,] 进行比较时,差异计数存储在结果矩阵的 mat[c(2:3), 1] 位置。
我的问题是,我的输入矩阵最多可以有 1e+07 行,而由于需要进行 n^2 次比较,当前的实现在速度和内存上都无法扩展。任何建议和帮助都将不胜感激。
#' Count pairwise row differences of a matrix
#'
#' Compares every row of `x` with every later row and counts the number of
#' columns in which the two rows differ.
#'
#' @param x A two-dimensional object (typically a matrix) with at least two
#'   rows.
#' @return A square numeric matrix with one row and one column per row of
#'   `x`. Entry `[i, j]` with `i > j` holds the number of columns in which
#'   rows `i` and `j` of `x` differ; the diagonal and the upper triangle are
#'   `NA`. A count is `NA` when the comparison of the two rows yields `NA`
#'   in any column.
diffMatrix <- function(x) {
  dims <- dim(x)
  if (length(dims) != 2L) stop("'x' must be a matrix")
  rows <- dims[1L]
  if (rows <= 1) stop("'x' must have atleast two rows")

  # Transpose once so that every original row becomes a column. Comparing a
  # block of columns against a single column then uses ordinary column-wise
  # recycling, so no replicated matrix has to be built per iteration.
  tx <- t(x)

  # The result needs rows^2 cells regardless of how the counts are computed.
  mat <- matrix(NA_real_, rows, rows)

  # Fill the lower-left triangle column by column, skipping the diagonal.
  for (row in seq_len(rows - 1L)) {
    later <- (row + 1L):rows
    # drop = FALSE keeps a matrix even when only one later row remains.
    mat[later, row] <- colSums(tx[, later, drop = FALSE] != tx[, row])
  }
  mat
}
# Small demonstration: six rows of two columns sampled from 1:12.
# No seed is set, so the values printed below come from one particular run
# and will differ between sessions.
m <- matrix(sample(1:12, 12, replace = TRUE), ncol = 2, byrow = TRUE)
m
# [,1] [,2]
#[1,] 8 1
#[2,] 4 1
#[3,] 8 4
#[4,] 4 5
#[5,] 3 1
#[6,] 2 2
x <- diffMatrix(m)
x
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] NA NA NA NA NA NA
#[2,] 1 NA NA NA NA NA
#[3,] 1 2 NA NA NA NA
#[4,] 2 1 2 NA NA NA
#[5,] 1 1 2 2 NA NA
#[6,] 2 2 2 2 2 NA
# Larger input (5000 rows x 10 columns) used for the timing recorded below.
m <- matrix(sample(1:5, 50000, replace = TRUE), ncol = 10, byrow = TRUE)
# system.time(x <- diffMatrix(m))
# user system elapsed
# 20.39 0.38 21.43
答案 0 :(得分:2)
以下是使用 `.Call` 的替代方法(似乎有效,但我无法保证):
library(inline)
# Compiled counterpart of the pairwise row-difference count.
#
# Takes an integer or double matrix and returns an integer square matrix with
# one row and one column per input row. Entry [i, j] with i > j is the number
# of columns in which input rows i and j differ; the diagonal and the upper
# triangle are NA. Any other input (non-matrix, or another storage type)
# raises an R error.
#
# NOTE: values are compared with the C `!=` operator. R stores integer NA as
# an ordinary int value and double NA as a NaN, so two integer NAs count as
# equal while a double NA/NaN counts as different from everything; an
# R-level `!=` would yield NA in both cases.
ff <- cfunction(sig = c(R_mat = "matrix"), body = '
  /* Reject anything without a two-element dim attribute before reading it. */
  if (!isMatrix(R_mat))
    error("R_mat must be a matrix");

  const int type = TYPEOF(R_mat);
  if (type != REALSXP && type != INTSXP)
    error("R_mat must be an integer or double matrix");

  /* Copy the extents right away; the dim attribute belongs to R_mat. */
  const int *pdims = INTEGER(getAttrib(R_mat, R_DimSymbol));
  const int nrow = pdims[0];
  const int ncol = pdims[1];

  /* allocMatrix sets the dim attribute and sizes the vector without
     overflowing int arithmetic. This is the only object that needs
     protecting, so a single UNPROTECT(1) balances every path below. */
  SEXP ans = PROTECT(allocMatrix(INTSXP, nrow, nrow));
  int *pans = INTEGER(ans);

  /* Diagonal and upper triangle stay NA. */
  const R_xlen_t total = (R_xlen_t) nrow * nrow;
  for (R_xlen_t k = 0; k < total; k++) {
    pans[k] = NA_INTEGER;
  }

  /* Column-major storage: element (r, c) lives at r + c * nrow. Offsets are
     computed as R_xlen_t so that large inputs cannot overflow an int. */
  switch (type) {
  case REALSXP:
    {
      const double *pmat = REAL(R_mat);
      for (int i = 0; i < nrow; i++) {
        for (int ir = i + 1; ir < nrow; ir++) {
          int diffs = 0;
          for (int ic = 0; ic < ncol; ic++) {
            const R_xlen_t off = (R_xlen_t) ic * nrow;
            if (pmat[i + off] != pmat[ir + off]) {
              diffs++;
            }
          }
          pans[ir + (R_xlen_t) i * nrow] = diffs;
        }
      }
      break;
    }
  case INTSXP:
    {
      const int *pmat = INTEGER(R_mat);
      for (int i = 0; i < nrow; i++) {
        for (int ir = i + 1; ir < nrow; ir++) {
          int diffs = 0;
          for (int ic = 0; ic < ncol; ic++) {
            const R_xlen_t off = (R_xlen_t) ic * nrow;
            if (pmat[i + off] != pmat[ir + off]) {
              diffs++;
            }
          }
          pans[ir + (R_xlen_t) i * nrow] = diffs;
        }
      }
      break;
    }
  }

  UNPROTECT(1);
  return ans;
')
# The six example rows from the question, entered column-wise as doubles.
m <- matrix(c(8, 4, 8, 4, 3, 2, 1, 1, 4, 5, 1, 2), ncol = 2)
ff(m)
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] NA NA NA NA NA NA
#[2,] 1 NA NA NA NA NA
#[3,] 1 2 NA NA NA NA
#[4,] 2 1 2 NA NA NA
#[5,] 1 1 2 2 NA NA
#[6,] 2 2 2 2 2 NA
# all.equal() compares numerically, so the double result of diffMatrix() and
# the integer result of ff() are reported as equal.
all.equal(diffMatrix(m), ff(m))
#[1] TRUE
# Integer input (5000 rows x 10 columns) for the timing comparison; unseeded,
# so the timings below are from one particular run.
m2 <- matrix(sample(1:5, 50000, replace = TRUE), ncol = 10, byrow = TRUE)
library(microbenchmark)
microbenchmark(diffMatrix(m2), ff(m2), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# diffMatrix(m2) 6972.9778 7049.3455 7427.807 7633.7581 11337.3154 10
# ff(m2) 422.3195 469.5771 530.470 661.8299 862.3092 10
all.equal(diffMatrix(m2), ff(m2))
#[1] TRUE