我在数据框中有一列X
,为此我需要找到累积的标准偏差。
X Cumulative.SD
1 -
4 2.12
5 2.08
6 2.16
9 2.91
答案 0 :(得分:7)
基本概念
x <- c(1, 4, 5, 6, 9)
## cumulative sample size
n <- seq_along(x)
## cumulative mean
m <- cumsum(x) / n
#[1] 1.000000 2.500000 3.333333 4.000000 5.000000
## cumulative squared mean
m2 <- cumsum(x * x) / n
#[1] 1.0 8.5 14.0 19.5 31.8
## cumulative variance
v <- (m2 - m * m) * (n / (n - 1))
#[1] NaN 4.500000 4.333333 4.666667 8.500000
## cumulative standard deviation
s <- sqrt(v)
#[1] NaN 2.121320 2.081666 2.160247 2.915476
实用程序功能
cummean <- function (x) cumsum(x) / seq_along(x)
cumvar <- function (x, sd = FALSE) {
x <- x - x[sample.int(length(x), 1)] ## see Remark 2 below
n <- seq_along(x)
v <- (cumsum(x ^ 2) - cumsum(x) ^ 2 / n) / (n - 1)
if (sd) v <- sqrt(v)
v
}
## Rcpp version: `cumvar_cpp`
library(Rcpp)
cppFunction('NumericVector cumvar_cpp(NumericVector x, bool sd) {
int n = x.size();
NumericVector v(n);
srand(time(NULL));
double pivot = x[rand() % n];
double *xx = &x[0], *xx_end = &x[n], *vv = &v[0];
int i = 0; double xi, sum2 = 0.0, sum = 0.0, vi;
for (; xx < xx_end; xx++, vv++, i++) {
xi = *xx - pivot;
sum += xi; sum2 += xi * xi;
vi = (sum2 - (sum * sum) / (i + 1)) / i;
if (sd) vi = sqrt(vi);
*vv = vi;
}
return v;
}')
速度
cumvar
和cumvar_cpp
之所以快速,有两个原因:
O(n)
,而不是O(n ^ 2)
。对于越来越长的矢量,它们比简单的滚动计算要快。
x <- rnorm(1e+3)
library(microbenchmark)
library(TTR)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, TRUE),
"Rich" = vapply(seq_along(x), function(i) sd(x[1:i]), 1),
"akrun" = runSD(x, n = 1, cumulative = TRUE))
#Unit: microseconds
# expr min lq mean median uq max
# zheyuan 101.261 105.2505 118.85214 121.0040 128.5925 157.702
# zheyuan_cpp 69.749 72.7190 81.38878 82.2335 84.2820 213.193
# Rich 74595.329 75201.9420 77533.38803 75814.6945 77465.9945 136099.832
# akrun 4329.892 4436.0145 4710.82440 4669.8380 4715.6035 6908.231
x <- rnorm(1e+4)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, TRUE),
"akrun" = runSD(x, n = 1, cumulative = TRUE))
#Unit: microseconds
# expr min lq mean median uq
# zheyuan 842.844 863.676 997.9585 880.2245 968.077
# zheyuan_cpp 618.823 632.254 709.1971 639.2990 657.366
# akrun 147279.410 148200.370 150839.8161 149599.6170 151981.069
x <- rnorm(1e+5)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, TRUE),
"akrun" = runSD(x, n = 1, cumulative = TRUE),
times = 10)
#Unit: milliseconds
# expr min lq mean median uq
# zheyuan 8.446502 8.657557 22.531637 9.431389 11.082594
# zheyuan_cpp 6.189955 6.305053 6.698292 6.365656 6.812374
# akrun 14477.847050 14559.844609 14787.200820 14755.526655 15021.524429
备注1
我搜索了“累积方差R”,并发现了一个小包cumstats
。它具有cumvar
函数,但是是用sapply
编写的(例如Rich Scriven's answer),因此不需要我进行实验。
备注2
感谢Benjamin Christoffersen's professional elaboration。我在原始cumvar
中添加了以下几行以增强数值稳定性。
x <- x - x[sample.int(length(x), 1)]
然后与Ben的roll_var
相比,它返回正确的值。
## using Ben's example
set.seed(99858398)
x <- rnorm(1e2, mean = 1e8)
all.equal(cumvar_cpp(x, FALSE), base::c(roll_var(x)))
#[1] TRUE
计算累积方差的速度比较:
x <- rnorm(1e+6)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, FALSE),
"Ben_cpp" = roll_var(x),
times = 20)
#Unit: milliseconds
# expr min lq mean median uq max neval
# zheyuan 85.47676 87.36403 91.05656 89.64444 93.99912 102.04964 20
# zheyuan_cpp 42.27983 42.41443 44.29919 42.65548 46.43293 51.24379 20
# Ben_cpp 46.99105 47.12178 49.48072 47.76016 50.44587 60.11491 20
x <- rnorm(1e+7)
microbenchmark("zheyuan" = cumvar(x, TRUE),
"zheyuan_cpp" = cumvar_cpp(x, FALSE),
"Ben_cpp" = roll_var(x),
times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# zheyuan 1171.3624 1215.8870 1278.3862 1266.9576 1330.6168 1486.7895 10
# zheyuan_cpp 463.6257 473.2711 479.8156 476.8822 482.4766 512.0520 10
# Ben_cpp 571.7481 583.2694 587.9993 584.1206 602.0050 605.1515 10
答案 1 :(得分:2)
我们可以使用runSD
TTR::runSD(df1$X, n = 1, cumulative = TRUE)
#[1] NA 2.121320 2.081666 2.160247 2.915476
df1 <- data.frame(X = c(1, 4, 5, 6, 9))
答案 2 :(得分:2)
您还可以检查Wiki网站上的Algorithms for calculating variance并使用Rcpp
来实现Welford's Online algorithm,如下所示
library(Rcpp)
func <- cppFunction(
"arma::vec roll_var(arma::vec &X){
const arma::uword n_max = X.n_elem;
double xbar = 0, M = 0;
arma::vec out(n_max);
double *x = X.begin(), *o = out.begin();
for(arma::uword n = 1; n <= n_max; ++n, ++x, ++o){
double tmp = (*x - xbar);
xbar += (*x - xbar) / n;
M += tmp * (*x - xbar);
if(n > 1L)
*o = M / (n - 1.);
}
if(n_max > 0)
out[0] = std::numeric_limits<double>::quiet_NaN();
return out;
}", depends = "RcppArmadillo")
# it gives the same
x <- c(1, 4, 5, 6, 9)
drop(func(x))
#R [1] NaN 4.50 4.33 4.67 8.50
sapply(seq_along(x), function(i) var(x[1:i]))
#R [1] NA 4.50 4.33 4.67 8.50
# it is fast
x <- rnorm(1e+3)
microbenchmark::microbenchmark(
func = func(x),
sapply = sapply(seq_along(x), function(i) var(x[1:i])))
#R Unit: microseconds
#R expr min lq mean median uq max neval
#R func 9.09 9.88 30.7 20.5 21.9 1189 100
#R sapply 43183.49 45040.29 47043.5 46134.4 47309.7 80345 100
取平方根为您提供标准偏差。
此方法的主要优点是cancellation没有问题。例如,比较
# there are no issues with cancellation
set.seed(99858398)
x <- rnorm(1e2, mean = 1e8)
cumvar <- function (x, sd = FALSE) {
n <- seq_along(x)
v <- (cumsum(x ^ 2) - cumsum(x) ^ 2 / n) / (n - 1)
if (sd) v <- sqrt(v)
v
}
z1 <- drop(func(x))[-1]
z2 <- cumvar(x)[-1]
plot(z1, ylim = range(z1, z2), type = "l", lwd = 2)
lines(seq_along(z2), z2, col = "DarkBlue")
蓝线是您从均值平方中减去平方值的算法。
答案 3 :(得分:1)
您可以使用sapply()
。
sapply(seq_along(X), function(i) sd(X[1:i]))
# [1] NA 2.121320 2.081666 2.160247 2.915476
或更快一点的vapply()
。
vapply(seq_along(X), function(i) sd(X[1:i]), 1)
# [1] NA 2.121320 2.081666 2.160247 2.915476