我正在尝试评估不同OLS模型的样本外预测性能。最简单的时间序列回归如下所示: Y_t = b0 + b1 * Y_t-30 + e_t
模型的拟合周期（比方说 50 个观测值）结束后，我用 dynlm 包运行如下回归：
dynlm(as.zoo(Y) ~ L(as.zoo(Y), 30), start = "1996-01-01", end = timelist[i])
在我当前的代码中，我只是让索引 i 一直运行到样本末尾，并保存相应模型的 RMSE。但这个 RMSE 并不是一步向前（one-step-ahead）预测的误差。由于我现在的代码已经很慢，而且还没有完全实现我的目标，我想请教大家：有没有更好的实现思路？
总结一下,我想做以下事情:
1)在某个拟合周期(展开窗口,非滚动窗口)之后运行递归回归
2)创建一步到位的样本外预测
3)计算这些预测与实际观测值的均方根误差,以评估模型性能
到目前为止,我尝试使用巨大的for-loop和dynlm包,但结果并不令人满意。 任何输入都非常感谢,因为我一直在寻找解决方案已有一段时间了。我会在取得一些进展后立即更新我的示例代码。
# Minimal working example: expanding-window regressions fitted with dynlm.
library(xts)
library(zoo)
library(dynlm)

# Daily date index spanning two years of simulated data.
timelist <- seq.Date(from = as.Date("1996-01-01"), to = as.Date("1998-01-01"), by = "days")
set.seed(123)
Y <- xts(rnorm(n = length(timelist)), order.by = timelist)
X <- xts(rnorm(n = length(timelist), mean = 10), order.by = timelist)

# Container for per-iteration results: Date, iteration index, RMSE.
# (length(index(timelist)) was redundant; timelist is already a plain vector.)
rmse.container.full <- data.frame(matrix(NA, ncol = 3, nrow = length(timelist)))
colnames(rmse.container.full) <- c("Date", "i", "rmse.m1")
rmse.container.full$Date <- timelist

# Expanding-window fit: the start date is fixed, the end date moves forward.
# NOTE(review): the question text regresses Y on its own 30-day lag, but this
# code lags X instead — confirm which regressor is intended.
for (i in 50:length(timelist)) {
  model1 <- dynlm(as.zoo(Y) ~ L(as.zoo(X), 30), start = "1996-01-01", end = timelist[i])
  rmse.container.full[i, 2] <- i
  # summary()$sigma is the IN-SAMPLE residual standard error, not the
  # out-of-sample one-step-ahead RMSE the question is after.
  rmse.container.full[i, 3] <- summary(model1)$sigma
  print(i)
}
答案 0（得分：1）
您可以使用我编写的rollRegres
包进行样本外预测,如下所示(它比以前的解决方案更快)
# Simulate data for the example: one predictor, y = 10 - X + unit noise.
set.seed(101)
n <- 1000L
X <- rnorm(n)                  # single predictor
y <- 10 - X + rnorm(n)         # intercept 10, slope -1
dat <- data.frame(y = y, X)    # columns: y, X
# Wrapper returning out-of-sample one-step-ahead predicted values from an
# expanding-window (do_downdates = FALSE) recursive regression.
library(rollRegres)
wrapper <- function(formula, data, min_window_size){
  # BUG FIX: the body referenced the global `frm` instead of the `formula`
  # argument, so the wrapper only worked when a global `frm` happened to exist.
  out <- roll_regres(
    formula = formula, data = data, width = min_window_size,
    do_downdates = FALSE,
    do_compute = "1_step_forecasts")$one_step_forecasts
  # Drop the leading NAs produced before the first window is full.
  out[!is.na(out)]
}
# Brute-force reference: refit lm() on an expanding window ending at each
# position and predict one step ahead.
func <- function(formula, data, min_window_size){
  fit_ends <- seq(min_window_size, nrow(data) - 1L)
  one_step <- function(last_row) {
    fit <- lm(formula, data = data[seq_len(last_row), ])
    predict(fit, newdata = data[last_row + 1L, ])
  }
  sapply(fit_ends, one_step)
}
# Show that the two implementations give the same forecasts.
frm <- y ~ X
r1 <- wrapper(frm, dat, 25L)
r2 <- func (frm, dat, 25L)
# The element names differ between the two, so ignore attributes.
all.equal(r1, r2, check.attributes = FALSE)
#R> [1] TRUE
# Compare run times (5 replications of each approach).
microbenchmark::microbenchmark(
func = func(frm, dat, 25L), roll_regres = wrapper(frm, dat, 25L),
times = 5)
#R> Unit: microseconds
#R> expr min lq mean median uq max neval
#R> func 1027213.048 1028723.765 1050103.171 1034833.792 1038513.793 1121231.455 5
#R> roll_regres 560.198 569.284 1073.778 610.766 636.445 2992.198 5
注意约 560 / 1028700 的相对计算时间。然后，您可以像您自己的答案中那样，用这些预测值计算 RMSE。
使用 biglm 包中的 Fortran 函数可以大大减少计算时间。该函数实现自：Miller, A. J. (1992). Algorithm AS 274: Least squares routines to supplement those of Gentleman. Journal of the Royal Statistical Society. Series C (Applied Statistics), 41(2), 458-478.
您可以使用下面的代码执行此操作：
# simulate data
# Simulate data: 10 predictors, true intercept 10, uniform(0, 1) slopes.
# (RNG call order matters for reproducibility: rnorm, runif, rnorm.)
set.seed(101)
n <- 1000L
X <- matrix(rnorm(10L * n), nrow = n, ncol = 10L)
true_beta <- runif(10)
y <- drop(10 + X %*% true_beta) + rnorm(n)
dat <- data.frame(y = y, X)   # columns: y, X1..X10
# Recursive least squares via the AS 274 (Miller, 1992) Fortran routines
# shipped with the biglm package: INCLUD adds one observation to an updating
# QR decomposition, REGCF solves for the current coefficients.
biglm_wrapper <- function(formula, data, min_window_size){
  mf <- model.frame(formula, data)
  X <- model.matrix(terms(mf), mf)
  # BUG FIX: `y - model.response(mf)` computed a difference and discarded it;
  # the response must be ASSIGNED, otherwise `y` is the caller's global.
  y <- model.response(mf)
  n <- nrow(X)
  p <- ncol(X)
  storage.mode(X) <- "double"
  storage.mode(y) <- "double"
  w <- 1   # unit observation weight
  # State of the updating QR decomposition (see AS 274).
  qr <- list(
    d = numeric(p), rbar = numeric(choose(p, 2)),
    thetab = numeric(p), sserr = 0, checked = FALSE, tol = numeric(p))
  nrbar <- length(qr$rbar)
  beta. <- numeric(p)
  # One forecast per i = min_window_size, ..., n - 1.
  # BUG FIX: preallocate the exact output length; the original
  # `n - min_window_size - 2` was too short and relied on implicit growth.
  out <- numeric(n - min_window_size)
  for(i in seq_len(n - 1L)){
    row <- X[i, ]   # will be overwritten by INCLUD
    qr[c("d", "rbar", "thetab", "sserr")] <- .Fortran(
      "INCLUD", np = p, nrbar = nrbar, weight = w, xrow = row, yelem = y[i],
      d = qr$d, rbar = qr$rbar, thetab = qr$thetab, sserr = qr$sserr,
      ier = 0L,   # ier is an output error flag; input value is ignored
      PACKAGE = "biglm")[
        c("d", "rbar", "thetab", "sserr")]
    if(i >= min_window_size){
      # Solve for the coefficients given data up to row i, then forecast
      # row i + 1 out of sample.
      coef. <- .Fortran(
        "REGCF", np = p, nrbar = nrbar, d = qr$d, rbar = qr$rbar,
        thetab = qr$thetab, tol = qr$tol, beta = beta., nreq = p, ier = 0L,
        PACKAGE = "biglm")[["beta"]]
      out[i - min_window_size + 1] <- coef. %*% X[i + 1, ]
    }
  }
  out
}
# Brute-force comparison: refit lm() on rows 1..k and predict row k + 1,
# for every expanding window of at least min_window_size observations.
func <- function(formula, data, min_window_size){
  ks <- seq(min_window_size, nrow(data) - 1)
  preds <- lapply(ks, function(k) {
    fit <- lm(formula, data = data[seq_len(k), ])
    predict(fit, newdata = data[k + 1, ])
  })
  unlist(preds, recursive = FALSE)
}
# Show that the Fortran-based wrapper and the brute-force lm() loop agree.
frm <- y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10
r1 <- biglm_wrapper(frm, dat, 25)
r2 <- func(frm, dat, 25)
all.equal(r1, r2, check.attributes = FALSE)
#R> [1] TRUE
# Compare run times (5 replications of each approach).
# BUG FIX: the benchmark called the undefined `f2`; the comparison
# function defined above is named `func`.
microbenchmark::microbenchmark(
  r1 = biglm_wrapper(frm, dat, 25), r2 = func(frm, dat, 25),
  times = 5)
#R> Unit: milliseconds
#R> expr min lq mean median uq max neval cld
#R> r1 9.976505 10.00653 11.85052 10.53157 13.36377 15.37424 5 a
#R> r2 1095.944410 1098.29661 1122.17101 1098.58264 1113.48833 1204.54306 5 b
答案 1（得分：0）
好吧,作为提出这个问题的人,我想贡献我如何解决我的问题:
由于我只需要提前一步的预测，我可以去掉其他所有东西，这使代码运行得更快。（每个模型的运行时间从 12 分钟降到 10 秒。）
我自己创建了完整的数据帧(包括滞后)并使用lm而不是dynlm。 下面的代码给了我我想要的结果(我手动检查了前几个观察结果,似乎有效)。代码改编自:Recursive regression in R
# Expanding-window one-step-ahead forecasts: fit on rows 1..k of the
# pre-built lagged data frame, predict row k + 1. The first fit uses
# 1400 observations; the result is a list of single predictions.
mod1.predictions <- lapply(seq(1400, nrow(df.full) - 1), function(k) {
  fit <- lm(Y ~ X, data = df.full[seq_len(k), ])
  predict(fit, newdata = df.full[k + 1, ])
})
为了计算RMSE,我使用了这个函数
# Root-mean-square error of simulated/predicted values against observations,
# ignoring any pairs containing NA.
rmse <- function(sim, obs) {
  squared_errors <- (sim - obs)^2
  sqrt(mean(squared_errors, na.rm = TRUE))
}
感谢CrossValidated的提示,它帮助了很多。