使用 R 的 caret 包进行时间序列预测

时间:2016-09-27 10:07:40

标签: r machine-learning time-series r-caret predict

我正在尝试使用 caret 包构建一个预测模型,用来预测 economics 数据集中变量 pop 的对数差分(log difference)。

分析

library(dplyr)
library(caret)

# `economics` is not defined in this script; it is presumably the monthly US
# economic data set shipped with ggplot2 (attached via caret) -- TODO confirm
my_data <- data.frame(economics)

# make the Y variable "pop" into a leading variable so I am predicting the
# future value: each row pairs the current predictors with next month's pop
# (the final row has no future value and gets NA)
new_data <- cbind(my_data, Y = lead(my_data$pop, n = 1))

# the current-month pop stays in the data as a predictor called pop_lag;
# rename by name rather than by column position so a reordered or extended
# data set cannot silently rename the wrong column
names(new_data)[names(new_data) == "pop"] <- "pop_lag"

# date into rownames
new_data <- rownames_date(new_data)

# lagging variables to use as prediction variables: every column except the
# current pop and the target Y
lag_vars <- setdiff(names(new_data), c("pop_lag", "Y"))
new_data <- do_lag(new_data, lag_vars)
new_data <- na.omit(new_data)

# take the logdiff of all the time series
new_data <- logdiff_func(new_data)
new_data <- na.omit(new_data)

# create training and testing sets: a chronological 70/30 split, so the model
# is always evaluated on observations that come after its training data.
# Integer arithmetic gives floor(0.7 * n) exactly, with no fractional or
# floating-point upper bound for the index sequence.
n_train <- (7L * nrow(new_data)) %/% 10L
# with zero training rows, `-trainIndex` below would select no rows at all
stopifnot(n_train >= 1L)
trainIndex <- seq_len(n_train)

new_train <- new_data[trainIndex, ]
new_test  <- new_data[-trainIndex, ]

# Resampling scheme for tuning: rolling-origin ("timeslice") validation with a
# fixed 50-row training window and a 15-row horizon, so every resample is
# scored on rows that follow its training window.
ts_control <- trainControl(method = 'timeslice',
                           initialWindow = 50,
                           horizon = 15,
                           fixedWindow = TRUE)

# Candidate mtry values: 1 up to the number of columns other than Y.
mtry_grid <- data.frame(mtry = 1:(ncol(new_data)-1))

# Train a random forest on the training data. `importance = TRUE` is not a
# train() argument of its own; it is forwarded to the underlying model fit.
rf_model <- train(Y ~ .,
                  data = new_train,
                  method = 'rf',
                  trControl = ts_control,
                  tuneGrid = mtry_grid,
                  importance = TRUE)

# Predict the held-out (most recent) rows with the tuned model.
rf.fit <- predict(rf_model, new_test)

# plotting results
tick.dates <- c('1970-01-01', '1980-01-01', '1990-01-01', '2000-01-01', '2010-01-01')

# predictions exist only for the held-out tail of the series, so the leading
# (training) rows are padded with NA to line predictions up with their months
n_padding <- nrow(new_data) - length(rf.fit)
rf.pred <- data.frame(Month = rownames(new_data),
                      actual = new_data$Y,
                      predicted = c(rep(NA, n_padding), rf.fit))

# Month is a character column (taken from the row names), hence the discrete
# x scale; only the decade starts are labelled. na.rm = TRUE drops the
# deliberate NA padding without a warning. The y axis shows Y, the log
# difference of the lead of pop, so it is labelled as pop.
ggplot(rf.pred, aes(x = Month, y = actual)) +
  geom_point(alpha = .7) +
  geom_point(aes(y = predicted), color = 'red', alpha = .7, shape = 3,
             na.rm = TRUE) +
  scale_x_discrete(breaks = tick.dates) +
  labs(title = 'Time-sliced random forest actual (black) and predicted (red)',
       x = 'Month', y = 'logdiff pop')

# resampling performance across the mtry tuning grid
plot(rf_model)

# MSE and RMSE on the held-out rows (computed once, printed twice)
test_mse <- MSE(new_test$Y, rf.fit)
test_mse
sqrt(test_mse)

这是训练模型并进行预测的正确方法吗?所有的预测都是向前一步(one-step-ahead)的预测吗?以下是我使用的函数:

函数

# Append lagged copies of selected columns to a data frame.
#
# Arguments:
#   data        data frame; rows are assumed to be in time order (the lag is
#               taken by row position, not by any date column).
#   variables   character vector naming the columns of `data` to lag.
#   num_periods number of lags to create for each variable (default 1).
#
# Returns `data` with one additional column per variable and lag. The
# one-period lag is named "<variable>_lag"; deeper lags are named
# "<variable>_lag2", "<variable>_lag3", ... so every lag keeps its own column
# instead of overwriting the previous one. Rows that have no earlier
# observation to copy from are filled with NA.
do_lag <- function(data, variables, num_periods = 1) {
  num_rows <- nrow(data)

  # iterating over the values / seq_len() is a no-op for empty input,
  # whereas 1:0 would run the loop body twice with invalid indices
  for (variable in variables) {
    for (period in seq_len(num_periods)) {
      suffix <- if (period == 1) "_lag" else paste0("_lag", period)

      # never pad with more NAs than there are rows, so a lag longer than the
      # data yields an all-NA column rather than a length-mismatch error
      shift <- min(period, num_rows)
      data[[paste0(variable, suffix)]] <-
        c(rep(NA, shift), head(data[[variable]], num_rows - shift))
    }
  }

  data
}

# Mean squared error of a set of predictions.
#
# Arguments:
#   testY  observed values.
#   fit    predicted values.
#
# Returns the sum of the squared differences `testY - fit` divided by the
# number of differences.
MSE <- function(testY, fit) {
  squared_residuals <- (testY - fit)^2
  sum(squared_residuals) / length(squared_residuals)
}

# Move the date column of a data frame into its row names.
#
# Arguments:
#   data  data frame that may contain a column named "Date" or "date".
#
# Returns `data` with the first such column removed and its values (converted
# to character by `rownames<-`) used as row names. If no such column exists,
# `data` is returned unchanged with a warning; returning nothing here would
# make the caller's `x <- rownames_date(x)` replace its data with NULL.
rownames_date <- function(data) {
  date_col <- which(names(data) %in% c("Date", "date"))

  if (length(date_col) == 0) {
    warning("no 'Date' or 'date' column found; data returned unchanged",
            call. = FALSE)
    return(data)
  }

  # only the first matching column is used
  date_col <- date_col[1]
  rownames(data) <- data[[date_col]]
  data[[date_col]] <- NULL
  data
}


# Replace every column of a data frame by its log difference.
#
# Arguments:
#   data  data frame of numeric columns; rows containing NA are dropped first.
#
# Returns the NA-free data frame with the same rows and columns, where row t
# of each column holds log(x[t + 1]) - log(x[t]), i.e. the change from row t
# to the following row. The last row has no following row and is NA in every
# column, so callers usually drop it with na.omit().
logdiff_func <- function(data) {
  data <- na.omit(data)

  # nothing to transform; also keeps empty input from reaching the column
  # assignment below with mismatched lengths
  if (nrow(data) == 0 || ncol(data) == 0) {
    return(data)
  }

  for (column in seq_along(data)) {
    # diff() yields one value fewer than its input; the trailing NA restores
    # the original length and marks the final row as undefined
    data[[column]] <- c(diff(log(data[[column]])), NA_real_)
  }

  data
}

0 个答案:

没有答案