假设我有以下数据:
# Number of simulated rows and the scalar numerator of the scale factors.
N <- 4000
sdY <- 142
# One-row data frame holding one divisor per column.
sdX <- data.frame(
  name1 = 1.37,
  name2 = 3.84,
  name3 = 14.89,
  name4 = 226.47
)
# N-by-4 data frame of standard-normal draws (rnorm() defaults to mean 0 and
# sd 1); the columns are drawn in the order name1..name4.
X1 <- data.frame(
  name1 = rnorm(N),
  name2 = rnorm(N),
  name3 = rnorm(N),
  name4 = rnorm(N)
)
# Per-column scale factors, kept as a one-row data frame like sdX.
z <- sdY / sdX
我想创建一个如下所示的新数据框:
# Reference result: transpose so that the length-4 divisor vector recycles
# down each column of the transposed matrix (i.e. once per original column),
# scale, then transpose back and convert to a data frame.
V1 <- local({
  scaled_t <- t(X1) * sdY / as.numeric(sdX)
  as.data.frame(t(scaled_t))
})
我可以想到另外四种方法来生成相同的数据框:
# Scale each column of X1 by right-multiplying with a diagonal matrix of the
# per-column factors. as.numeric() turns the one-row data frame sdX into a
# plain vector so diag() receives a numeric vector rather than a list.
V2 <- as.data.frame(as.matrix(X1) %*% diag(sdY / as.numeric(sdX)))
# The matrix product carries no column names (as.data.frame() falls back to
# V1..V4), so restore them before comparing; otherwise all.equal() reports a
# names mismatch instead of TRUE.
names(V2) <- names(sdX)
all.equal(V1, V2)
# Multiply X1 element-wise by a full-size matrix whose rows all equal
# 1 / sdX; byrow = TRUE lays the four reciprocals out across the columns.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
V3 <- as.data.frame(
  X1 * sdY * matrix(
    1 / as.numeric(sdX),
    nrow = nrow(X1), ncol = ncol(X1), byrow = TRUE
  )
)
all.equal(V1, V3)
# Scale column by column. vapply() pins the result of every iteration to a
# numeric vector of nrow(X1) values (sapply() would silently change its return
# type on unexpected input), and seq_len() stays empty when X1 has no columns,
# whereas 1:ncol(X1) would yield c(1, 0).
V4 <- local({
  # The scale vector does not depend on k, so compute it once.
  scale_by <- sdY / as.numeric(sdX)
  scaled <- vapply(
    seq_len(ncol(X1)),
    function(k) X1[, k] * scale_by[k],
    numeric(nrow(X1))
  )
  as.data.frame(scaled)
})
# The column-index loop drops the column names; restore them before comparing.
names(V4) <- names(sdX)
all.equal(V1, V4)
# Pair each column of X1 with its one-element scale factor from z and
# multiply; mapply() simplifies the equal-length results into a matrix whose
# column names come from X1.
V5 <- as.data.frame(
  mapply(function(column, factor) column * factor, X1, z)
)
all.equal(V1, V5)
我做了一个基准测试,结果显示 V1 是最快的:
library(microbenchmark)
# Time the five data-frame constructions against each other (1000 runs each).
# The timed expressions are kept as written; only the reassignable shorthand T
# is spelled out as TRUE.
microbenchmark(
  # Double transpose with vector recycling.
  V1 = as.data.frame(t(t(X1) * sdY / as.numeric(sdX))),
  # Matrix product with a diagonal scaling matrix.
  V2 = as.data.frame(as.matrix(X1) %*% diag(sdY / sdX)),
  # Element-wise product with a full-size matrix of reciprocals.
  V3 = as.data.frame(
    X1 * sdY * matrix(
      1 / as.numeric(sdX),
      nrow = nrow(X1), ncol = ncol(X1), byrow = TRUE
    )
  ),
  # Column-by-column loop.
  V4 = as.data.frame(
    sapply(1:ncol(X1), function(k) X1[, k] * (sdY / as.numeric(sdX))[k])
  ),
  # mapply() over the columns of X1 and the entries of z.
  V5 = as.data.frame(mapply(`*`, X1, z)),
  times = 1000
)
Unit: microseconds
expr min lq mean median uq max neval
V1 325.556 348.5945 392.7839 360.1845 380.2265 3204.285 1000
V2 564.480 597.2055 656.6340 622.5305 661.1400 10060.082 1000
V3 1513.979 1592.2820 1903.7942 1641.5110 1724.2650 178914.886 1000
V4 376.862 404.4905 441.4825 417.5230 435.9220 3650.707 1000
V5 4866.712 5044.0630 5231.5006 5147.9860 5298.7725 8174.186 1000
有更好的方法吗?能否用 dplyr 或 data.table 做到这一点?
如果初始数据不是数据框(而是矩阵和向量),我可以这样做:
# Same setup as a plain matrix and a named numeric vector instead of data
# frames.
N <- 4000
sdY <- 142
# Named vector holding one divisor per column.
sdX <- setNames(c(1.37, 3.84, 14.89, 226.47), paste0("name", 1:4))
# N-by-4 numeric matrix of standard-normal draws (rnorm() defaults to mean 0
# and sd 1); the columns are drawn in the order name1..name4.
X <- cbind(
  name1 = rnorm(N),
  name2 = rnorm(N),
  name3 = rnorm(N),
  name4 = rnorm(N)
)
library(microbenchmark)
# Time the matrix-based variants against each other (1000 runs each).
microbenchmark(
  # Double transpose with vector recycling.
  V1 = t(t(X) * sdY / sdX),
  # Matrix product with a diagonal scaling matrix.
  V2 = as.matrix(X) %*% diag(sdY / sdX),
  # Element-wise product with a full-size matrix of reciprocals.
  V3 = X * sdY * matrix(
    1 / as.numeric(sdX),
    nrow = nrow(X), ncol = ncol(X), byrow = TRUE
  ),
  # Column-by-column loop.
  V4 = sapply(1:ncol(X), function(k) X[, k] * (sdY / as.numeric(sdX))[k]),
  # mapply() over a bare matrix walks its individual elements and recycles the
  # four factors down the rows, which scales by row position and returns a flat
  # vector. Iterating over the columns of a data frame gives the intended
  # per-column scaling. Any V5 timing recorded with the element-wise call does
  # not describe this expression.
  V5 = mapply(`*`, as.data.frame(X), sdY / sdX),
  # Element-wise product with the factors repeated N times each, which matches
  # the column-major layout of X.
  V6 = X * rep(sdY / sdX, each = N),
  times = 1000
)
Unit: microseconds
expr min lq mean median uq max neval
V1 128.222 133.3330 143.08527 138.0615 147.4370 243.614 1000
V2 61.047 66.9935 75.10065 71.3025 79.1750 136.405 1000
V3 175.991 181.0220 194.86671 187.3085 202.5145 309.282 1000
V4 321.646 336.9605 364.27685 350.2760 376.5665 784.877 1000
V5 10250.179 11432.2920 12571.32181 11989.4650 12988.0405 21276.846 1000
V6 602.723 608.6780 648.34465 626.7520 656.6065 1004.817 1000
如果我将 N 增加到 N <- 4000000:
# Re-run the matrix-based variants (without V5) for 100 runs each. The timed
# expressions are kept as written; only the reassignable shorthand T is
# spelled out as TRUE.
microbenchmark(
  # Double transpose with vector recycling.
  V1 = t(t(X) * sdY / sdX),
  # Matrix product with a diagonal scaling matrix.
  V2 = as.matrix(X) %*% diag(sdY / sdX),
  # Element-wise product with a full-size matrix of reciprocals.
  V3 = X * sdY * matrix(
    1 / as.numeric(sdX),
    nrow = nrow(X), ncol = ncol(X), byrow = TRUE
  ),
  # Column-by-column loop.
  V4 = sapply(1:ncol(X), function(k) X[, k] * (sdY / as.numeric(sdX))[k]),
  # Element-wise product with the factors repeated N times each; N must equal
  # nrow(X) for the repeated factors to line up with the columns.
  V6 = X * rep(sdY / sdX, each = N),
  times = 100
)
Unit: milliseconds
expr min lq mean median uq max neval
V1 171.4577 176.38111 183.94015 181.04543 184.04083 359.7489 100
V2 70.1990 72.72581 77.85295 73.83967 78.74599 256.6801 100
V3 204.3800 209.07647 218.43955 212.89379 215.72365 449.2121 100
V4 320.1775 327.35325 344.21326 330.75827 336.29847 535.5430 100
V6 639.3619 646.63353 664.39093 650.70151 655.15810 908.4869 100