我在R中有一个非常大的字符矩阵,大约[500000,5],包含名称。每行可能包含重复的名称。我想知道每一行有多少不同的名字。据我所知,我无法对此循环中的任何函数进行矢量化,对吧?
例如:
sampleNames <- c("Bob", "Elliot", "Sarah")
# Dimensions [100000, 5]
mat <- matrix(sampleNames[round(runif(500000, 1, 3))], ncol = 5)
NamesPerRow <- vector()
startTime <- Sys.time()
for(i in 1:dim(mat)[1]){
NamesPerRow[i] <- length(unique(mat[i,]))
}
Sys.time() - startTime
我的机器只需20秒。非常容忍。但是,如果矩阵的行数是5倍,则循环所用的时间要长于100秒:
sampleNames <- c("Bob", "Elliot", "Sarah")
# Dimensions [500000, 5]
mat <- matrix(sampleNames[round(runif(2500000, 1, 3))], ncol = 5)
NamesPerRow <- vector()
startTime <- Sys.time()
for(i in 1:dim(mat)[1]){
NamesPerRow[i] <- length(unique(mat[i,]))
}
Sys.time() - startTime
我的机器需要13.12分钟。比100000x5矩阵长40倍。岂有此理!
我可以用来更快地执行这些操作的任何技巧?我可以在这里做任何事情吗?这是我用多线程解决的问题(我不熟悉)?
此外,这里发生了什么?计算时间的增加是否比我运行的数据快得多?
谢谢。
答案 0 :(得分:4)
您还可以使用rowTabulates
包
matrixStats
# Dimensions [500000, 5]
mat <- matrix(sampleNames[round(runif(2500000, 1, 3))], ncol = 5)
library(matrixStats)
startTime <- Sys.time()
mat1 <- matrix(match(mat, sampleNames), ncol=5)
b <- rowSums(rowTabulates(mat1)!=0)
Sys.time() - startTime
# Time difference of 0.2012889 secs
@Richard Scriven的 apply()
startTime <- Sys.time()
a <- apply(mat, 1, function(x) length(unique(x)))
Sys.time() - startTime
# Time difference of 4.231503 secs
all.equal(a, b)
# [1] TRUE
答案 1 :(得分:2)
您可以使用apply()
设计用于矩阵,从而节省大量时间。但是在这里你也可以通过分配返回向量而不是在循环中构建它来节省大量时间。
sampleNames <- c("Bob", "Elliot", "Sarah")
# Dimensions [100000, 5]
mat <- matrix(sampleNames[round(runif(500000, 1, 3))], ncol = 5)
这是我们为for()
循环分配返回向量的地方。
## instead of writing the generic vector() call
NamesPerRow <- vector("integer", nrow(mat))
现在使用您当前的方法,我们有:
system.time({
for(i in seq_along(NamesPerRow)) { ## seq_along() also slightly faster
NamesPerRow[i] <- length(unique(mat[i,]))
}
})
# user system elapsed
# 1.144 0.000 1.127
使用apply()
system.time({
a <- apply(mat, 1, function(x) length(unique(x)))
})
# user system elapsed
# 1.012 0.000 0.993
检查:
identical(NamesPerRow, a)
# [1] TRUE
因此,分配矢量的简单行为可以节省大量时间。
答案 2 :(得分:2)
只是提出第三个建议,你可以使用Rcpp:
library('Rcpp');
sampleNames <- c('Bob','Elliot','Sarah');
set.seed(1); mat <- matrix(sampleNames[round(runif(2500000,1,3))],ncol=5); ## 500000x5
head(mat);
## [,1] [,2] [,3] [,4] [,5]
## [1,] "Elliot" "Elliot" "Bob" "Elliot" "Elliot"
## [2,] "Elliot" "Sarah" "Elliot" "Sarah" "Elliot"
## [3,] "Elliot" "Elliot" "Elliot" "Bob" "Bob"
## [4,] "Sarah" "Bob" "Bob" "Sarah" "Sarah"
## [5,] "Bob" "Elliot" "Bob" "Bob" "Bob"
## [6,] "Sarah" "Bob" "Elliot" "Elliot" "Elliot"
cppFunction('
IntegerVector distinctByRow(IntegerMatrix mat) {
IntegerVector res(mat.nrow());
if (mat.ncol() == 0) return res;
std::vector<int> buf(mat.ncol());
for (size_t r = 0; r < mat.nrow(); ++r) {
IntegerMatrix::Row row = mat.row(r);
buf.assign(row.begin(),row.end());
std::sort(buf.begin(),buf.end());
int count = 1;
for (size_t c = 1; c < mat.ncol(); ++c)
if (buf[c] != buf[c-1])
++count;
res(r) = count;
}
return res;
}
');
res.rcpp <- distinctByRow(matrix(match(mat,sampleNames),nrow(mat)));
head(res.rcpp);
## [1] 2 2 2 2 2 3
## libs
library('Rcpp');
library('matrixStats');
## funcs
f.loop.grow <- function(mat) { res <- vector(); for (i in seq_len(nrow(mat))) res[i] <- length(unique(mat[i,])); res; };
f.loop.prealloc <- function(mat) { res <- vector('integer',nrow(mat)); for (i in seq_len(nrow(mat))) res[i] <- length(unique(mat[i,])); res; };
f.apply <- function(mat) apply(mat,1,function(x) length(unique(x)));
f.rowtab <- function(mat) rowSums(rowTabulates(matrix(match(mat,sampleNames),nrow(mat))) != 0L);
f.rcpp <- function(mat) distinctByRow(matrix(match(mat,sampleNames),nrow(mat)));
## data
sampleNames <- c('Bob','Elliot','Sarah');
set.seed(1); mat <- matrix(sampleNames[round(runif(2500000,1,3))],ncol=5); ## 500000x5
## proof of correctness
all.equal(f.loop.grow(mat),f.loop.prealloc(mat));
## [1] TRUE
all.equal(f.loop.prealloc(mat),f.apply(mat));
## [1] TRUE
all.equal(f.apply(mat),f.rowtab(mat));
## [1] TRUE
all.equal(f.rowtab(mat),f.rcpp(mat));
## [1] TRUE
## timing
microbenchmark(f.loop.grow(mat),f.loop.prealloc(mat),f.apply(mat),f.rowtab(mat),f.rcpp(mat),times=3L);
## Unit: milliseconds
## expr min lq mean median uq max neval
## f.loop.grow(mat) 96624.4954 99011.9452 100625.0517 101399.3950 102625.3299 103851.2648 3
## f.loop.prealloc(mat) 3572.0831 3574.6325 3616.9598 3577.1820 3639.3982 3701.6145 3
## f.apply(mat) 3329.4926 3410.6111 3486.2511 3491.7296 3564.6304 3637.5311 3
## f.rowtab(mat) 259.8664 288.6030 299.2716 317.3395 318.9742 320.6089 3
## f.rcpp(mat) 122.1257 124.6957 163.4774 127.2657 184.1532 241.0407 3
我很好奇matrixStats::rowTabulates()
函数(在ExperimenteR的答案中使用)是如何在内部工作的,所以我查看了源代码。这是来自R/rowTabulates.R
:
rowTabulates <- function(x, values=NULL, ...) {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Validate arguments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Argument 'x':
if (is.integer(x)) {
} else if (is.raw(x)) {
} else {
stop("Argument 'x' is not of type integer or raw: ", class(x)[1]);
}
# Argument 'values':
if (is.null(values)) {
values <- as.vector(x);
values <- unique(values);
if (is.raw(values)) {
values <- as.integer(values);
values <- sort(values);
# WORKAROUND: Cannot use "%#x" because it gives an error OSX with
# R v2.9.0 devel (2009-01-13 r47593b) at R-forge. /HB 2009-06-20
names <- sprintf("%x", values);
names <- paste("0x", names, sep="");
values <- as.raw(values);
} else {
values <- sort(values);
names <- as.character(values);
}
} else {
if (is.raw(values)) {
names <- sprintf("%x", as.integer(values));
names <- paste("0x", names, sep="");
} else {
names <- as.character(values);
}
}
nbrOfValues <- length(values);
counts <- matrix(0L, nrow=nrow(x), ncol=nbrOfValues);
colnames(counts) <- names;
for (kk in seq(length=nbrOfValues)) {
counts[,kk] <- rowCounts(x, value=values[kk], ...);
}
counts;
}
最有趣的部分是最后的for
循环;它们实际上循环遍历输入中的每个唯一值并获取行数!我意识到如果输入中有许多唯一值,这可能会导致函数表现出较差的性能,而不像OP的示例数据那样我们只有三个。所以我做了另一个性能测试,这次有1000个唯一值,我还决定尝试使用更少的行和更多的列。如您所见,结果与我上面的结果完全相反。这真实地说明了算法的行为方式会有很大不同,具体取决于您向它们投入的数据!
## data 2 -- more names and columns
rstr <- function(N,charset=letters,lf=function(N) runif(N,trunc(lmin)-0.5,trunc(lmax)+0.5),lmin=1,lmax=10) {
charset <- as.character(charset);
len <- sort(as.integer(round(pmin(lmax,pmax(lmin,lf(N))))));
rl <- rle(len);
sample(do.call(c,Map(function(len,num) if (len == 0) rep('',num) else do.call(paste0,as.data.frame(matrix(sample(charset,len*num,replace=T),num))), rl$values, rl$lengths )));
};
set.seed(1); N <- 1e3; sampleNames <- rstr(N);
head(sampleNames);
## [1] "wcbzjxq" "etxjz" "ompognqack" "eufkli" "rworpwkk" "ghw"
mat <- matrix(sample(sampleNames,2500000,replace=T),ncol=500); ## 5000x500
head(mat[,1:6]);
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] "qgrb" "gb" "pmiula" "wrx" "yr" "kejil"
## [2,] "ivaqaaek" "alen" "woenvkgkh" "zkocecowl" "mjgv" "ejqks"
## [3,] "nvz" "yr" "kyxmjjrnn" "vfzc" "tnm" "cnw"
## [4,] "ut" "jgexsepo" "jh" "ejqks" "iy" "galtchwmh"
## [5,] "ppxe" "bnpqxbj" "nvz" "ruulsigdzq" "hpuw" "rjsofvjev"
## [6,] "bdoxqim" "qr" "mgkkku" "agjdgjhv" "bdoxqim" "bdoxqim"
## proof of correctness 2
all.equal(f.loop.grow(mat),f.loop.prealloc(mat));
## [1] TRUE
all.equal(f.loop.prealloc(mat),f.apply(mat));
## [1] TRUE
all.equal(f.apply(mat),f.rowtab(mat));
## [1] TRUE
all.equal(f.rowtab(mat),f.rcpp(mat));
## [1] TRUE
## timing 2
microbenchmark(f.loop.grow(mat),f.loop.prealloc(mat),f.apply(mat),f.rowtab(mat),f.rcpp(mat),times=3L);
## Unit: milliseconds
## expr min lq mean median uq max neval
## f.loop.grow(mat) 153.3568 157.6669 167.5521 161.9770 174.6497 187.3223 3
## f.loop.prealloc(mat) 141.1644 142.8239 144.1546 144.4834 145.6497 146.8159 3
## f.apply(mat) 166.2976 177.0187 195.1381 187.7397 209.5583 231.3770 3
## f.rowtab(mat) 2590.8117 2623.3600 2665.5511 2655.9082 2702.9207 2749.9333 3
## f.rcpp(mat) 197.6206 197.7765 202.5478 197.9324 205.0113 212.0903 3