我有一个问题是找到一种最有效的方法来计算具有多列的xts对象的滚动线性回归。我已经在stackoverflow上搜索并阅读了之前的几个问题。
这个question and answer接近但我认为还不够,因为我想计算所有回归中因变量不变的多次回归。我试图用随机数据重现一个例子:
require(xts)
require(RcppArmadillo) # Load libraries
data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE) # Random data
data[1000:1500, 2] <- NA # insert NAs to make it more similar to true data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))
NR <- nrow(data) # number of observations
NC <- ncol(data) # number of factors
obs <- 30 # required number of observations for rolling regression analysis
info.names <- c("res", "coef")
info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names
创建数组是为了随时间和因素存储多个变量(残差,系数等)。
loop.begin.time <- Sys.time()
for (j in 2:NC) {
cat(paste("Processing residuals for factor:", j), "\n")
for (i in obs:NR) {
regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
residuals.temp <- regression.temp$residuals
info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
info[i, "coef", j] <- regression.temp$coefficients[2]
}
}
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time) # prints the loop runtime
由于循环显示的想法是每次针对其中一个因素运行30个观察滚动回归,其中data[, 1]
作为因变量(因子)。我必须将30个残差存储在临时对象中,以便将它们标准化,因为fastLm
不会计算标准化残差。
如果xts对象中的列数(因子)增加到100左右,那么循环非常慢并且变得很麻烦 - 1,000列将需要永恒。我希望有一个更有效的代码来创建大型数据集的滚动回归。
答案 0 :(得分:10)
如果你达到线性回归数学的水平,它应该很快。如果X是自变量而Y是因变量。系数由
给出 Beta = inv(t(X) %*% X) %*% (t(X) %*% Y)
我对你希望成为哪个变量以及哪个是独立变量感到有点困惑,但希望下面解决类似的问题对你也有帮助。
在下面的示例中,我使用1000个变量而不是原始的5个,并且不引入任何NA。
require(xts)
data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE) # Random data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))
NR <- nrow(data) # number of observations
NC <- ncol(data) # number of factors
obs <- 30 # required number of observations for rolling regression analysis
现在我们可以使用Joshua的TTR包计算系数。
library(TTR)
loop.begin.time <- Sys.time()
in.dep.var <- data[,1]
xx <- TTR::runSum(in.dep.var*in.dep.var, obs)
coeffs <- do.call(cbind, lapply(data, function(z) {
xy <- TTR::runSum(z * in.dep.var, obs)
xy/xx
}))
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time) # prints the loop runtime
3.934461秒的时差
res.array = array(NA, dim=c(NC, NR, obs))
for(z in seq(obs)) {
res.array[,,z] = coredata(data - lag.xts(coeffs, z-1) * as.numeric(in.dep.var))
}
res.sd <- apply(res.array, c(1,2), function(z) z / sd(z))
如果我在索引res.sd
中没有出现任何错误,应该为您提供标准化残差。请随时修复此解决方案以纠正任何错误。
答案 1 :(得分:0)
使用rollRegres
软件包可以更快地完成此操作
library(xts)
library(RcppArmadillo)
#####
# simulate data
set.seed(50554709)
data <- matrix(sample(1:10000, 1500), 1500, 5, byrow = TRUE) # Random data
# data[1000:1500, 2] <- NA # only focus on the parts that are computed
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))
#####
# setup for solution in OP
NR <- nrow(data)
NC <- ncol(data)
obs <- 30L
info.names <- c("res", "coef")
info <- array(NA, dim = c(NR, length(info.names), NC))
colnames(info) <- info.names
#####
# solve with rollRegres
library(rollRegres)
loop.begin.time <- Sys.time()
X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
fit <- roll_regres.fit(
y = data[, j], x = X, width = obs, do_compute = c("sigmas"))
# are you sure you want the residual of the first and not the last
# observation in each window?
idx <- 1:(nrow(data) - obs + 1L)
idx_tail <- idx + obs - 1L
resids <- c(rep(NA_real_, obs - 1L),
data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))
# the package uses the unbaised estimator so we have to time by this factor
# to get the same
sds <- fit$sigmas * sqrt((obs - 2L) / (obs - 1L))
unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.03123808 secs
#####
# solve with original method
loop.begin.time <- Sys.time()
for (j in 2:NC) {
cat(paste("Processing residuals for factor:", j), "\n")
for (i in obs:NR) {
regression.temp <- fastLm(data[i:(i-(obs-1)), j] ~ data[i:(i-(obs-1)), 1])
residuals.temp <- regression.temp$residuals
info[i, "res", j] <- round(residuals.temp[1] / sd(residuals.temp), 4)
info[i, "coef", j] <- regression.temp$coefficients[2]
}
}
#R Processing residuals for factor: 2
#R Processing residuals for factor: 3
#R Processing residuals for factor: 4
#R Processing residuals for factor: 5
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time) # prints the loop runtime
#R Time difference of 7.554767 secs
#####
# check that results are the same
all.equal(info[, "coef", 2L], out[[1]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 2L], out[[1]][, "res"])
#R [1] TRUE
all.equal(info[, "coef", 3L], out[[2]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 3L], out[[2]][, "res"])
#R [1] TRUE
all.equal(info[, "coef", 4L], out[[3]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 4L], out[[3]][, "res"])
#R [1] TRUE
all.equal(info[, "coef", 5L], out[[4]][, "coef"])
#R [1] TRUE
all.equal(info[, "res" , 5L], out[[4]][, "res"])
#R [1] TRUE
请注意上述解决方案中的此评论
# are you sure you want the residual of the first and not the last
# observation in each window?
library(rollRegres)
require(xts)
data <- matrix(sample(1:10000, 1500000, replace=T), 1500, 1000, byrow = TRUE) # Random data
data <- xts(data, order.by = as.Date(1:1500, origin = "2000-01-01"))
NR <- nrow(data) # number of observations
NC <- ncol(data) # number of factors
obs <- 30 # required number of observations for rolling regression analysis
loop.begin.time <- Sys.time()
X <- cbind(1, drop(data[, 1]))
out <- lapply(2:NC, function(j){
fit <- roll_regres.fit(
y = data[, j], x = X, width = obs, do_compute = c("sigmas"))
# are you sure you want the residual of the first and not the last
# observation in each window?
idx <- 1:(nrow(data) - obs + 1L)
idx_tail <- idx + obs - 1L
resids <- c(rep(NA_real_, obs - 1L),
data[idx, j] - rowSums(fit$coefs[idx_tail, ] * X[idx, ]))
# the package uses the unbaised estimator so we have to time by this factor
# to get the same
sds <- fit$sigmas * sqrt((obs - 2L) / (obs - 1L))
unclass(cbind(coef = fit$coefs[, 2L], res = drop(round(resids / sds, 4))))
})
loop.end.time <- Sys.time()
print(loop.end.time - loop.begin.time)
#R Time difference of 0.9019711 secs
时间包括用于计算标准化残差的时间。