我正在尝试构建一个预测模型,使用 caret 包来预测 economics 数据集中变量 pop
的对数差分(log difference)。
library(dplyr)
library(caret)
# End-to-end script: build a lead target from `pop`, add lagged predictors,
# transform everything to log-differences, fit a random forest with caret
# using time-slice resampling, then plot and score the test-set predictions.
# It calls do_lag(), MSE(), rownames_date() and logdiff_func(), which are
# defined further down in this file and must be sourced before this runs.

# Copy `economics` into a plain data.frame.
# presumably this is the ggplot2 `economics` dataset -- TODO confirm
my_data <- data.frame(economics)
# make the Y variable "pop" into a leading variable so I am predicting the future value
# (lead(..., n = 1) shifts pop up by one row and is appended as column 7)
new_data <- cbind(my_data, lead(my_data$pop, n = 1))
# renaming the leading pop into Y and the other pop into pop_lag
names(new_data)[7] <- "Y"
names(new_data)[3] <- "pop_lag"
# date into rownames
new_data <- rownames_date(new_data)
# lagging variables to use as prediction variables.
# Columns at positions 2 and 6 are excluded from lagging
# (presumably pop_lag and Y once the date column is gone -- verify).
new_data <- do_lag(new_data, names(new_data)[-c(2,6)])
# Drop the rows made incomplete by the lead/lag shifts.
new_data <- na.omit(new_data)
# take the logdiff of all the time series
new_data <- logdiff_func(new_data)
# logdiff_func leaves the last row as NA; remove it.
new_data <- na.omit(new_data)
# create training and testing sets
# Chronological split: the first ~70% of rows train, the remainder tests.
# NOTE(review): round(x, 1) rounds to one decimal place, not to an integer;
# round(x) was probably intended -- confirm.
trainIndex <- 1:round((0.7*nrow(new_data)),1)
new_train <- new_data[ trainIndex,]
new_test <- new_data[-trainIndex,]
# train a randomforest model on the training data
# Resampling uses caret's 'timeslice' method with a fixed window of 50
# observations and a horizon of 15; mtry is tuned over 1..(ncol(new_data) - 1).
rf_model <- train(Y ~ ., data = new_train, method = 'rf',
trControl = trainControl(method = 'timeslice',
initialWindow = 50,
horizon = 15,
fixedWindow = TRUE),
importance = TRUE,
tuneGrid = data.frame(mtry = 1:(ncol(new_data)-1)))
# make predictions on testing data
rf.fit <- predict(rf_model, new_test)
# plotting results
tick.dates <- c('1970-01-01', '1980-01-01', '1990-01-01', '2000-01-01', '2010-01-01')
# One row per observation in new_data; predictions are left-padded with NA so
# that they line up with the trailing (test) rows.
rf.pred <- data.frame(Month = rownames(new_data), actual = new_data$Y,
predicted = c(rep(NA, dim(new_data)[1] - length(rf.fit)),
rf.fit))
# NOTE(review): the y-axis label below says 'logdiff Unemployment', but the
# plotted series is Y, which is derived from pop -- confirm the intended label.
ggplot(rf.pred, aes(x = Month, y = actual)) + geom_point(alpha = .7) +
geom_point(aes(y = predicted), color = 'red', alpha = .7, shape = 3) +
scale_x_discrete(breaks = tick.dates) +
labs(title = 'Time-sliced random forest actual (black) and predicted (red)',
x = 'Month', y = 'logdiff Unemployment')
# Resampling performance across the tuning grid.
plot(rf_model)
# MSE and RMSE
# Both are computed on the held-out test rows.
MSE(new_test$Y,rf.fit)
sqrt(MSE(new_test$Y,rf.fit))
这是训练模型和进行预测的正确方法吗?所有的预测都是向前一步(one-step-ahead)预测吗?以下是我使用的函数:
#' Append lagged copies of selected columns to a data frame
#'
#' For every name in `variables` and every lag `k` in `1..num_periods`, a new
#' column is added that holds the source column shifted down by `k` rows, with
#' the first `k` entries set to `NA`.
#'
#' Column naming: lag 1 is stored as `<variable>_lag` (unchanged from the
#' historical behaviour of the default call); lag `k >= 2` is stored as
#' `<variable>_lag<k>`. Previously every lag was written to `<variable>_lag`,
#' so only the last one survived.
#'
#' @param data A data frame.
#' @param variables Character vector of column names in `data` to lag.
#'   An empty vector returns `data` unchanged.
#' @param num_periods Number of lags to create per variable (default 1).
#' @return `data` with the lagged columns appended.
do_lag <- function(data, variables, num_periods = 1) {
  unknown <- setdiff(variables, names(data))
  if (length(unknown) > 0) {
    stop("Unknown column(s): ", paste(unknown, collapse = ", "),
         call. = FALSE)
  }

  num_rows <- nrow(data)
  for (variable in variables) {
    for (period in seq_len(num_periods)) {
      suffix <- if (period == 1) "_lag" else paste0("_lag", period)
      # Cap the shift so a lag longer than the data yields an all-NA column
      # instead of a length-mismatch error.
      shift <- min(period, num_rows)
      data[[paste0(variable, suffix)]] <- c(
        rep(NA, shift),
        head(data[[variable]], num_rows - shift)
      )
    }
  }
  data
}
#' Mean squared error between observed and fitted values
#'
#' @param testY Numeric vector of observed values.
#' @param fit Numeric vector of predictions, aligned with `testY`.
#' @return The sum of squared differences divided by the number of elements.
MSE <- function(testY, fit) {
  residuals <- testY - fit
  squared <- residuals^2
  sum(squared) / length(residuals)
}
#' Move the date column of a data frame into its row names
#'
#' Looks for the first column named exactly `"Date"` or `"date"`, uses its
#' values as row names and removes the column.
#'
#' If no such column exists the input is returned unchanged. Previously the
#' function fell through the loop and returned `NULL` in that case, which
#' silently wiped out the caller's data.
#'
#' @param data A data frame.
#' @return `data` with the date column turned into row names, or `data`
#'   itself when it has no `Date`/`date` column.
rownames_date <- function(data) {
  # Index of the first matching column, or NA when none matches
  # (this also covers data frames with zero columns).
  date_col <- match(TRUE, names(data) %in% c("Date", "date"))
  if (is.na(date_col)) {
    return(data)
  }
  rownames(data) <- data[[date_col]]
  data[[date_col]] <- NULL
  data
}
#' Replace every column of a data frame with its log-difference
#'
#' Rows containing `NA` are dropped first. Each column `x` is then replaced by
#' `diff(log(x))`, stored so that row `t` holds `log(x[t + 1]) - log(x[t])`.
#' The last row has no successor and is set to `NA` in every column; callers
#' typically drop it with `na.omit()`. Row names are preserved.
#'
#' Data frames with zero or one row (after `NA` removal) are handled without
#' error: an empty frame is returned as is, and a single row becomes all `NA`.
#'
#' @param data A data frame whose columns are all numeric.
#' @return A data frame with the same shape as `na.omit(data)` holding the
#'   log-differences, with the last row set to `NA`.
logdiff_func <- function(data) {
  data <- na.omit(data)
  if (nrow(data) == 0) {
    return(data)
  }
  for (col in seq_along(data)) {
    # diff() yields nrow - 1 values; pad the tail so the column length is kept.
    data[[col]] <- c(diff(log(data[[col]])), NA_real_)
  }
  data
}