是否有一种简单快捷的方法来获取 R 中整数向量中出现的每个整数的频率?
到目前为止,我的尝试是:
x <- floor(runif(1000000)*1000)
print('*** using TABLE:')
system.time(as.data.frame(table(x)))
print('*** using HIST:')
system.time(hist(x,breaks=min(x):(max(x)+1),plot=FALSE,right=FALSE))
print('*** using SORT')
system.time({cdf<-cbind(sort(x),seq_along(x)); cdf<-cdf[!duplicated(cdf[,1]),2]; c(cdf[-1],length(x)+1)-cdf})
print('*** using ECDF')
system.time({i<-min(x):max(x); cdf<-ecdf(x)(i)*length(x); cdf-c(0,cdf[-length(i)])})
print('*** counting in loop')
system.time({h<-rep(0,max(x)+1);for(i in seq_along(x)){h[x[i]]<-h[x[i]]+1}; h})
#print('*** vectorized summation') #This uses too much memory if x is large
#system.time(colSums(matrix(rbind(min(x):max(x))[rep(1,length(x)),]==x,ncol=max(x)-min(x)+1)))
#Note: There are some fail cases in some of the above methods that need patching if, for example, there is a chance that some integer bins are unoccupied
以下是结果:
[1] "*** using TABLE:"
user system elapsed
1.26 0.03 1.29
[1] "*** using HIST:"
user system elapsed
0.11 0.00 0.10
[1] "*** using SORT"
user system elapsed
0.22 0.02 0.23
[1] "*** using ECDF"
user system elapsed
0.17 0.00 0.17
[1] "*** counting in loop"
user system elapsed
3.12 0.00 3.12
正如您所看到的,table
速度非常慢,hist
似乎是最快的。但是hist
(因为我正在使用它)正在研究任意可指定的断点,而我只是想要整数。难道没有办法交换这种灵活性以获得更好的性能吗?
在 C 中,for(i=0;i<1000000;i++)h[x[i]]++;
会非常快。
答案 0 :(得分:6)
最快的是使用tabulate
,但它需要正整数作为输入,因此您必须进行快速单调转换。
set.seed(21)
x <- as.integer(runif(1e6)*1000)
system.time({
adj <- 1L - min(x)
y <- setNames(tabulate(x+adj), sort(unique(x)))
})
答案 1 :(得分:4)
不要忘记您可以在R中内联C ++代码。
library(inline)
src <- '
Rcpp::NumericVector xa(a);
int n_xa = xa.size();
int test = max(xa);
Rcpp::NumericVector xab(test);
for (int i = 0; i < n_xa; i++)
xab[xa[i]-1]++;
return xab;
'
fun <- cxxfunction(signature(a = "numeric"),src, plugin = "Rcpp")
答案 2 :(得分:2)
我认为制表或C ++版本是要走的路,但是这里有一些使用rbenchmark的代码,这是一个很好的查看时间的包(我还添加了一些较慢的函数测试):
######################
### ---Clean Up--- ###
######################
rm(list = ls())
gc()
######################
### ---Packages--- ###
#####################
require(parallel)
require(data.table)
require(rbenchmark)
require(inline)
#######################
### ---Functions--- ###
#######################
# Competitor functions by Breyal
Breyal.using_datatable <- function(x) {DT <- data.table(x = x, weight = 1, key = "x"); DT[, length(weight), by = x]}
Breyal.using_lapply_1c_eq <- function(x = sort(x)) { lapply(unique(x), function(u) sum(x == u)) } # 1 core
Breyal.using_mclapply_8c_eq <- function(x = sort(x)) { mclapply(unique(x), function(u) sum(x == u), mc.cores = 8L) } # 8 cores
# Competitor functions by tennenrishin
tennenrishin.using_table <- function(x) as.data.frame(table(x))
tennenrishin.using_hist <- function(x) hist(x,breaks=min(x):(max(x)+1),plot=FALSE,right=FALSE)
tennenrishin.using_sort <- function(x) {cdf<-cbind(sort(x),seq_along(x)); cdf<-cdf[!duplicated(cdf[,1]),2]; c(cdf[-1],length(x)+1)-cdf}
tennenrishin.using_ecdf <- function(x) {i<-min(x):max(x); cdf<-ecdf(x)(i)*length(x); cdf-c(0,cdf[-length(i)])}
tennenrishin.using_counting_loop <- function(x) {h<-rep(0,max(x)+1);for(i in seq_along(x)){h[x[i]]<-h[x[i]]+1}; h}
# Competitor function by Ulrich
Ulrich.using_tabulate <- function(x) {
adj <- 1L - min(x)
y <- setNames(tabulate(x+adj), sort(unique(x)))
return(y)
}
# I couldn't get the Joe's C++ version to work (my laptop won't install inline) butI suspect that would be the fastest solution
##################
### ---Data--- ###
##################
# Set seed so results are reproducable
set.seed(21)
# Data vector
x <- floor(runif(1000000)*1000)
#####################
### ---Timings--- ###
#####################
# Benchmarks using Ubuntu 13.04 x64 with 8GB RAM and i7-2600K CPU @ 3.40GHz
benchmark(replications = 5,
tennenrishin.using_table(x),
tennenrishin.using_hist(x),
tennenrishin.using_sort(x),
tennenrishin.using_ecdf(x),
tennenrishin.using_counting_loop(x),
Ulrich.using_tabulate(x),
Breyal.using_datatable(x),
Breyal.using_lapply_1c_eq(x),
Breyal.using_mclapply_8c_eq(x),
order = "relative")
这导致以下时间
test replications elapsed relative user.self sys.self user.child sys.child
6 Ulrich.using_tabulate(x) 5 0.176 1.000 0.176 0.000 0.00 0.000
2 tennenrishin.using_hist(x) 5 0.468 2.659 0.468 0.000 0.00 0.000
3 tennenrishin.using_sort(x) 5 0.687 3.903 0.688 0.000 0.00 0.000
4 tennenrishin.using_ecdf(x) 5 0.749 4.256 0.748 0.000 0.00 0.000
7 Breyal.using_datatable(x) 5 2.960 16.818 2.960 0.000 0.00 0.000
1 tennenrishin.using_table(x) 5 4.651 26.426 4.596 0.052 0.00 0.000
9 Breyal.using_mclapply_8c_eq(x) 5 10.817 61.460 0.140 1.196 54.62 7.112
5 tennenrishin.using_counting_loop(x) 5 10.922 62.057 10.912 0.000 0.00 0.000
8 Breyal.using_lapply_1c_eq(x) 5 36.807 209.131 36.768 0.000 0.00 0.000