将一个数据框的第 i 列乘以另一个数据框第 i 列中的元素,最快的方法是什么?

时间:2016-04-08 16:02:04

标签: r data.table dplyr

假设我有以下数据:

# Number of rows and the scalar standard deviation of the response.
N <- 4000
sdY <- 142

# One-row data frame: one standard deviation per column of X1.
sdX <- data.frame(
  name1 = 1.37,
  name2 = 3.84,
  name3 = 14.89,
  name4 = 226.47
)

# N standard-normal draws per column (mean 0, sd 1 are the rnorm defaults),
# generated in the order name1, name2, name3, name4.
X1 <- data.frame(
  name1 = rnorm(N),
  name2 = rnorm(N),
  name3 = rnorm(N),
  name4 = rnorm(N)
)

# Per-column scaling factors, kept as a one-row data frame named like sdX.
z <- sdY / sdX

我想创建一个如下所示的新数据框:

# Reference result: column i of X1 multiplied by sdY / sdX[i].
# t(X1) puts the columns on rows, so the per-column divisor vector recycles
# in step with them; the outer t() restores the original orientation.
V1 <- as.data.frame(
  t((t(X1) * sdY) / as.numeric(sdX))
)

我可以想到另外四种方法来生成相同的数据框:

# Four alternative constructions of the same scaled data frame. Each result is
# compared with the reference V1 only after its column names are in place,
# because all.equal() on data frames compares the names as well as the values.

# V2: matrix product with a diagonal matrix of the scaling factors. The
# product carries no column names, so restore them before comparing.
V2 <- as.data.frame(as.matrix(X1) %*% diag(sdY / as.numeric(sdX)))
names(V2) <- names(sdX)
all.equal(V1, V2)

# V3: elementwise product with a full matrix of reciprocals, filled by row so
# that column i holds 1 / sdX[i].
V3 <- as.data.frame(
  X1 * sdY * matrix(1 / as.numeric(sdX),
                    nrow = nrow(X1), ncol = ncol(X1), byrow = TRUE)
)
all.equal(V1, V3)

# V4: scale one column at a time; vapply() binds the scaled columns into a
# matrix and fixes the expected length of each column.
V4 <- as.data.frame(
  vapply(
    seq_len(ncol(X1)),
    function(k) X1[, k] * (sdY / as.numeric(sdX))[k],
    numeric(nrow(X1))
  )
)
names(V4) <- names(sdX)
all.equal(V1, V4)

# V5: pair each column of X1 with the matching factor in z.
V5 <- as.data.frame(mapply(`*`, X1, z))
all.equal(V1, V5)

我做了一个基准测试,看起来 V1 是最快的:

library(microbenchmark)
# Time the five data-frame constructions, 1000 runs each. Every expression
# includes the as.data.frame() conversion; the names<- fix-ups applied to V2
# and V4 elsewhere are not part of what is timed.
microbenchmark(V1 = as.data.frame(t(t(X1) * sdY/as.numeric(sdX))),
               V2 = as.data.frame(as.matrix(X1) %*% diag(sdY/sdX)),
               V3 = as.data.frame(X1*sdY*matrix(1/as.numeric(sdX),nrow = nrow(X1), ncol=ncol(X1), byrow = T)),
               V4 = as.data.frame(sapply(1:ncol(X1), function(k)X1[,k]*(sdY/as.numeric(sdX))[k])),
               V5 = as.data.frame(mapply(`*`, X1, z)),
               times = 1000)

Unit: microseconds
expr      min        lq      mean    median        uq        max neval
V1  325.556  348.5945  392.7839  360.1845  380.2265   3204.285  1000
V2  564.480  597.2055  656.6340  622.5305  661.1400  10060.082  1000
V3 1513.979 1592.2820 1903.7942 1641.5110 1724.2650 178914.886  1000
V4  376.862  404.4905  441.4825  417.5230  435.9220   3650.707  1000
V5 4866.712 5044.0630 5231.5006 5147.9860 5298.7725   8174.186  1000

有更好的方法吗?能用 dplyr 或 data.table 做到这一点吗?

如果初始数据不是数据框(而是矩阵),我可以这样做:

# Same problem with plain numeric inputs instead of data frames.
N <- 4000
sdY <- 142

# Named numeric vector: one standard deviation per column of X.
sdX <- c(
  name1 = 1.37,
  name2 = 3.84,
  name3 = 14.89,
  name4 = 226.47
)

# N x 4 numeric matrix of standard-normal draws (mean 0, sd 1 are the rnorm
# defaults), with the columns generated in the order name1 ... name4.
X <- cbind(
  name1 = rnorm(N),
  name2 = rnorm(N),
  name3 = rnorm(N),
  name4 = rnorm(N)
)

library(microbenchmark)
# Same comparison on the numeric matrix X (no data-frame conversion), plus V6,
# which repeats each factor N times so it lines up with the columns of X.
# NOTE(review): X is a matrix here, so mapply() in V5 walks its individual
# elements and recycles the 4 factors along them; the result is a flat vector
# and the pairing does not look column-wise -- confirm it is comparable to V1.
# NOTE(review): as.matrix(X) in V2 is presumably a no-op, since X is already
# a matrix.
microbenchmark(V1 = t(t(X) * sdY/sdX),
               V2 = as.matrix(X) %*% diag(sdY/sdX),
               V3 = X*sdY*matrix(1/as.numeric(sdX),nrow = nrow(X), ncol=ncol(X), byrow = T),
               V4 = sapply(1:ncol(X), function(k)X[,k]*(sdY/as.numeric(sdX))[k]),
               V5 = mapply(`*`, X, sdY/sdX),
               V6 = X * rep( sdY/sdX, each = N ),
               times = 1000)

Unit: microseconds
expr       min         lq        mean     median         uq       max neval
V1   128.222   133.3330   143.08527   138.0615   147.4370   243.614  1000
V2    61.047    66.9935    75.10065    71.3025    79.1750   136.405  1000
V3   175.991   181.0220   194.86671   187.3085   202.5145   309.282  1000
V4   321.646   336.9605   364.27685   350.2760   376.5665   784.877  1000
V5 10250.179 11432.2920 12571.32181 11989.4650 12988.0405 21276.846  1000
V6   602.723   608.6780   648.34465   626.7520   656.6065  1004.817  1000 

如果我将N增加到N <- 4000000

# Repeat the matrix benchmark for the larger N (100 runs; V5 is left out).
# NOTE(review): V6 sizes its factor vector with N while the other expressions
# use nrow(X) -- this assumes X was regenerated after N was changed; confirm.
microbenchmark(V1 = t(t(X) * sdY/sdX),
               V2 = as.matrix(X) %*% diag(sdY/sdX),
               V3 = X*sdY*matrix(1/as.numeric(sdX),nrow = nrow(X), ncol=ncol(X), byrow = T),
               V4 = sapply(1:ncol(X), function(k)X[,k]*(sdY/as.numeric(sdX))[k]),
               V6 = X * rep( sdY/sdX, each = N ),
               times = 100)


Unit: milliseconds
expr      min        lq      mean    median        uq      max neval
V1 171.4577 176.38111 183.94015 181.04543 184.04083 359.7489   100
V2  70.1990  72.72581  77.85295  73.83967  78.74599 256.6801   100
V3 204.3800 209.07647 218.43955 212.89379 215.72365 449.2121   100
V4 320.1775 327.35325 344.21326 330.75827 336.29847 535.5430   100
V6 639.3619 646.63353 664.39093 650.70151 655.15810 908.4869   100

0 个答案:

没有答案