Question

我有一段很简单的代码，但在小数据集上运行一次迭代就需要 0.006 秒。我很担心，因为之后要把它用在大型数据集上。下面是我的代码：

# Weighted squared distance from every observation to each centroid.
# Relies on x, num.clust, d.num, w, num.centroid and nrows being defined
# before this snippet runs.
d1 <- matrix(0, nrow(x), num.clust)
# NOTE(review): the loop runs over ncol(d.num), yet i indexes rows of
# num.centroid and columns of d1 (which has num.clust columns); presumably
# the bound should be num.clust -- confirm.
for (i in seq_len(ncol(d.num))) {
  # For numeric attributes: repeat centroid i on every row, then take the
  # weighted squared difference from the data.
  d1.temp <- (w * (d.num - matrix(rep(as.numeric(num.centroid[i, ]), nrows),
                                  nrow = nrows, byrow = TRUE)))^2

  # Sum over attributes to get one value per observation.
  d1[, i] <- rowSums(d1.temp)
}

这是我使用的数据：

> head(d.num)
x3        x4
4.842316 11.754403
6.405585 11.643502
6.590780 11.478245
6.656699 11.293404
> num.centroid
     [,1]     [,2]
[1,] 7.605837 12.59816
[2,] 7.895469 12.92275

w 是一个与 d.num 维度相同的数据框。有什么办法可以缩短这段代码的执行时间吗？

Answer 1

如果按照我的建议把数据框（data frame）转换为矩阵，就可以像下面这样比较两种写法的速度。

# Baseline: the question's loop, with all arithmetic done on data frames.
# n is the number of simulated observations; returns an n x 2 numeric matrix.
f1 <- function(n) {
  # Simulated inputs: two rounded standard-normal columns. The weights are
  # just a copy of the data.
  obs <- data.frame(x3 = round(rnorm(n)), x4 = round(rnorm(n)))
  wts <- obs
  centres <- matrix(c(7, 8, 9, 10), nrow = 2)
  n_obs <- nrow(obs)
  n_col <- ncol(obs)
  out <- matrix(NA_real_, nrow = n_obs, ncol = n_col)
  for (k in seq_len(n_col)) {
    # Tile row k of the centroid matrix so every row of `tiled` holds it.
    tiled <- matrix(rep(centres[k, ], n_obs), nrow = n_obs, byrow = TRUE)
    sq_dev <- (wts * (obs - tiled))^2
    out[, k] <- rowSums(sq_dev)
  }
  out
}
# Same computation as the baseline, but data and weights are converted with
# as.matrix() first, so the loop works on plain numeric matrices.
# n is the number of simulated observations; returns an n x 2 numeric matrix.
f2 <- function(n) {
  sim <- data.frame(x3 = round(rnorm(n)), x4 = round(rnorm(n)))
  # Convert both operands up front; the weights are a copy of the data.
  obs <- as.matrix(sim)
  wts <- as.matrix(sim)
  centres <- matrix(c(7, 8, 9, 10), nrow = 2)
  n_obs <- nrow(obs)
  n_col <- ncol(obs)
  out <- matrix(NA_real_, nrow = n_obs, ncol = n_col)
  for (k in seq_len(n_col)) {
    # Tile row k of the centroid matrix so every row of `tiled` holds it.
    tiled <- matrix(rep(centres[k, ], n_obs), nrow = n_obs, byrow = TRUE)
    sq_dev <- (wts * (obs - tiled))^2
    out[, k] <- rowSums(sq_dev)
  }
  out
}

基准：

> library(microbenchmark)
> n <- 10
> microbenchmark(
+   code1 = f1(n),
+   code2 = f2(n),
+   times = 1000
+ )
Unit: microseconds
  expr      min        lq        mean   median       uq      max neval
 code1 1432.443 1480.8605 1628.889978 1545.789 1631.022 6229.565  1000
 code2  263.284  278.9020  313.293371  290.505  307.239 3138.880  1000
> n <- 1000
> microbenchmark(
+   code1 = f1(n),
+   code2 = f2(n),
+   times = 1000
+ )
Unit: microseconds
  expr      min       lq        mean   median        uq        max neval
 code1 1884.934 1924.873 2290.409508 1974.183 2111.8490 114038.521  1000
 code2  571.192  583.687  642.682019  601.537  637.4595   3499.891  1000

第二种写法（先转换为矩阵）明显更快。

R中的快速计算矩阵

1 个答案: