I have a multivariate CNN model that I previously had working. It takes 7 features as input (X) and predicts one feature (y) as output.
My next step was simply to make this already-working model multi-step: I want to predict 10 steps into the future.
I have been following this tutorial: How to Develop Convolutional Neural Network Models for Time Series Forecasting
After trying to follow it, my problem is that the prediction is not made up of the expected 90 base steps (matching the historic steps) plus another 10, nor even of just the 10 extra steps, which would also be reasonable given the tutorial. For some reason, my prediction comes out as 90 time steps x 10 different predictions/features. At least, that is what it looks like when I print the variable prediction at the end.
I have been staring blindly at this code for a while now. Am I using the wrong tutorial for my purposes, or am I missing something?
Note: yes, my split_seq() differs slightly from the tutorial's split_sequence(), but that is because, unlike the tutorial, I do not need to drop one feature (the output) from the set.
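To make the expected shapes concrete (illustrative numbers only, not taken from the tutorial):

# X.shape == (n_samples, 90, 7)   -> 90 historic steps x 7 features per window
# y.shape == (n_samples, 10)      -> one 10-step target vector per window
# model.predict(X).shape == (n_samples, 10)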
Thanks in advance!
If you would rather copy and/or read my Google Colab notebook directly, here it is: Google Collab copy
If you want edit access, PM me an email address; I would rather not allow public editing.
Otherwise, here is my code:
# FILE MANAGEMENT (get .csv from drive)
# GENERAL
import numpy as np
from numpy import hstack, array
from numpy import asarray
import pandas as pd
# Visualizations
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, describe
# KERAS
# Pin TF 1.x to avoid deprecation warnings (the magic must be alone on its line)
%tensorflow_version 1.x
from keras.utils import plot_model # Converts a Keras model to dot format and save to a file
# from keras.models import Model
# from keras.layers import Input # used to instantiate a Keras tensor
from keras.layers import Dense # Create layers
from keras.layers import Flatten # Flattens tensor to vector
from keras.layers.convolutional import Conv1D # 1D convolution
from keras.layers.convolutional import MaxPooling1D # Max pooling operation for 1D data
from keras.layers import Dropout
# from keras.layers.merge import concatenate
from keras.models import Sequential
from keras.callbacks import EarlyStopping, TerminateOnNaN
from sklearn import preprocessing
#from sklearn.preprocessing import MinMaxScaler
# Fix for the 'Run after'-bug caused by Colab
CLEAN_RUN = True
# IMPORT & NORMALIZE DATA
URL = 'https://raw.githubusercontent.com/victordahl/dataset/master/MSFT_10yrs.csv'
#URL = 'https://raw.githubusercontent.com/victordahl/dataset/master/MSFT_10yrs_open.csv'
csvfile = pd.read_csv(URL)
#print("cvs type", type(csvfile))
#print(csvfile.iloc[:, 1:])
min_max_scaler = preprocessing.MinMaxScaler()
# Normalize the last five columns (excluding the date in the first column)
x_scaled = min_max_scaler.fit_transform(csvfile.iloc[:, 1:])
df = pd.DataFrame(x_scaled)
# Rename all columns
df.columns = ["Open","High","Low","Close","Volume"]
# Derived features
short_SMA = 50
long_SMA = 200
df["50SMA"] = df["Open"].rolling(window=short_SMA).mean()
df["200SMA"] = df["Open"].rolling(window=long_SMA).mean()
# Trim head of dataset to remove the NaN days in SMA feature head
df = df.iloc[long_SMA:len(df)]
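# A close alternative (assumption: the SMA columns are the only NaN source):
# df = df.dropna().reset_index(drop=True)  # drops the first long_SMA-1 NaN rows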
# describe and visualize data
print(df.iloc[:, 0:4].describe(),"\n")
print(df.iloc[:, 4:].describe(),"\n")
#tail = int(input("Print tail:"))
tail=2517
fig, ax1 = plt.subplots()
plt.title("$MSFT 10yrs")
plt.xlabel("Date")
df_columns = []
for i in df.columns:
    if i != "Volume":
        print("plotting {}...".format(i))
        plt.plot(df[i].tail(tail), label=i)
        ax1.legend(loc=2)
    else:
        # Put 'Volume' on a second Y axis for a better plot
        print("plotting {}...".format(i))
        ax2 = ax1.twinx()
        ax2.plot(df[i].tail(tail), label=i)
        ax2.legend(loc=1)
# DATASET
SET_SIZE = 2000
raw_train_set = df.iloc[0:SET_SIZE]
raw_test_set = df.iloc[SET_SIZE:len(df)]
assert(len(raw_train_set)==SET_SIZE), "Something wrong with dataset separation..."
def split_seq(sequences, NUM_STEPS_IN, NUM_STEPS_OUT, return_y=False):
    """ Split a multivariate sequence into samples.
    sequences (array): sequence(s) to window, shape (steps, features)
    NUM_STEPS_IN (int): number of steps/days in the input.
    NUM_STEPS_OUT (int): number of steps/days in the output.
    return_y (bool): when True returns only y; default False returns only X
    OUTPUT: X OR y (array): feature with steps
    """
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + NUM_STEPS_IN
        out_end_ix = end_ix + NUM_STEPS_OUT - 1
        # check if we are beyond the dataset
        if out_end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        # (y starts at the last input step, hence the -1 offsets)
        seq_x, seq_y = sequences[i:end_ix, :], sequences[end_ix-1:out_end_ix, -1]
        X.append(seq_x)
        y.append(seq_y)
    if return_y: # y values for the feature to be predicted
        return array(y)
    return array(X) # X values for all features
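# Illustrative sanity check of split_seq() on a toy 2-feature sequence:
# 12 rows, 5 steps in, 2 steps out should give X -> (7, 5, 2) and y -> (7, 2).
# (y overlaps the last input step because of the -1 offsets above.)
_toy = np.arange(24, dtype=float).reshape(12, 2)
print("toy X shape:", split_seq(_toy, 5, 2).shape)                # (7, 5, 2)
print("toy y shape:", split_seq(_toy, 5, 2, return_y=True).shape) # (7, 2)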
def generate_features_n(NUM_STEPS_IN, NUM_STEPS_OUT, feat):
    """ Generate features for multivariate data.
    Args: feat (DataFrame): contains the N input features
    Output: X, y (np.array): X (input) and y (output) sequences
    """
    # DEFINE individual input sequences as arrays
    open_seq = array(feat["Open"])
    high_seq = array(feat["High"])
    low_seq = array(feat["Low"])
    close_seq = array(feat["Close"])
    vol_seq = array(feat["Volume"])
    ssma_seq = array(feat["50SMA"])
    lsma_seq = array(feat["200SMA"])
    # RESHAPE to columns
    open_seq = open_seq.reshape((len(open_seq), 1))
    high_seq = high_seq.reshape((len(high_seq), 1))
    low_seq = low_seq.reshape((len(low_seq), 1))
    close_seq = close_seq.reshape((len(close_seq), 1))
    vol_seq = vol_seq.reshape((len(vol_seq), 1))
    ssma_seq = ssma_seq.reshape((len(ssma_seq), 1))
    lsma_seq = lsma_seq.reshape((len(lsma_seq), 1))
    # HORIZONTALLY STACK columns
    dataset = hstack((open_seq, high_seq, low_seq, close_seq, vol_seq, ssma_seq, lsma_seq))
    # SPLIT SEQUENCES
    X = split_seq(dataset, NUM_STEPS_IN, NUM_STEPS_OUT)
    y = split_seq(close_seq, NUM_STEPS_IN, NUM_STEPS_OUT, return_y=True)
    return X, y
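# Note: the reshape + hstack dance in generate_features_n() should be equivalent
# to the one-liner below (assumption: the listed column order matches df):
# dataset = feat[["Open","High","Low","Close","Volume","50SMA","200SMA"]].to_numpy()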
# FEATURE GEN
NUM_STEPS_IN = 90
NUM_STEPS_OUT = 10
# TRAINING SET
train_set_X, train_set_y = generate_features_n(NUM_STEPS_IN, NUM_STEPS_OUT, raw_train_set)
# PREDICTION SET
test_set_X, test_set_y = generate_features_n(NUM_STEPS_IN, NUM_STEPS_OUT, raw_test_set )
# Assert that there are no NaN values
assert(not np.isnan(np.sum(train_set_X))), "NaN values found in training input!"
assert(not np.isnan(np.sum(train_set_y))), "NaN values found in training targets!"
assert(not np.isnan(np.sum(test_set_X))), "NaN values found in test input!"
assert(not np.isnan(np.sum(test_set_y))), "NaN values found in test targets!"
# MODEL
NUM_FILTERS = 32 #10 # <---------- TUNE ME
KERNEL_SIZE = 3 #2 # <---------- TUNE ME
UNITS_PER_DENSE_LAYER = 15 #10 # this should be AT LEAST 10 and LESS THAN 16! ) <---------- TUNE ME
# Features of train_set_X as shape =(samples, steps, features)
print("train shape", train_set_X.shape)
n_features = train_set_X.shape[2] # shape = (samples, steps, features), so index 2 is the feature count
print("######################")
print("features", n_features)
# ------------------------------------------------------------------------------------------------------------------- CONV 1
model = Sequential()
model.add(Conv1D(
filters = NUM_FILTERS,
kernel_size = KERNEL_SIZE,
activation = 'sigmoid',
input_shape = (NUM_STEPS_IN, n_features)
)
)
model.add(MaxPooling1D(pool_size=3)) # <-------------- Decrease before next Conv1D layer?
# IF MORE THAN 1 Conv1D layer add DROPOUT LAYER
"""model.add(Dropout(0.5)) #<-----------------------------------------Add later"""
# add conv
# add Maxpool
# add conv
# add Maxpool
model.add(Flatten())
model.add(Dense(UNITS_PER_DENSE_LAYER, activation = 'sigmoid'))
model.add(Dense(NUM_STEPS_OUT))
"""
look for CNN Time series prediction, not nec. stock m. prediction
more layers = more params = model needs more data
"""
model.compile(
optimizer = 'adam',
loss = 'mse',
metrics = ['accuracy','mean_squared_error'] # NB: 'accuracy' is not meaningful for regression
)
print(model.summary())
plot_model(model)
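# Shape check: the Dense(NUM_STEPS_OUT) head should yield ONE 10-value forecast
# per input window, i.e. an output shape of (None, NUM_STEPS_OUT) -- not 90 x 10.
print("Model output shape:", model.output_shape) # expected: (None, 10)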
# TRAIN
callbacks = [
EarlyStopping(
# Stop training when `val_loss` is no longer improving
monitor='val_loss',
# "no longer improving" being defined as "no better than 1e-2 less"
min_delta=1e-2,
# "no longer improving" defined as "for at least 2 epochs"
patience=10,
verbose=1
#restore_best_weights = True # This only seems to make it worse!
),
TerminateOnNaN() # Stops training if loss = NaN
]
print(n_features)
if CLEAN_RUN:
    # Only allows training if training has not been run
    history = model.fit(
        train_set_X.reshape((len(train_set_X), NUM_STEPS_IN, n_features)),
        train_set_y,
        validation_split = 0.2,
        epochs = 150, #150 <---------- TUNE ME (early stop callback)
        batch_size = 16, #16 <---------- TUNE ME
        verbose = 1,
        callbacks = callbacks
    )
    CLEAN_RUN = False # Prevents double-training
else:
    print("Session is not CLEAN_RUN. Use Colab 'Run all'!")
print("\n___MODEL LOSS___")
print("Final MSE: ", history.history['mean_squared_error'][-1])
plt.plot(history.history['mean_squared_error']) #previously 'loss'
plt.plot(history.history['val_mean_squared_error']) #previously 'val_loss'
plt.title('Model loss')
plt.ylabel('Loss value')
plt.xlabel('Epoch')
plt.legend(['Training', 'Validation (Keras validation_split=0.2)'], loc='best')
plt.show()
# Predict
print("Making prediction...")
x_input = test_set_X # already shaped (n_samples, NUM_STEPS_IN, n_features)
prediction = model.predict(x_input, verbose=1)
#print(type(prediction))
print("Pred. shape:", prediction.shape) # (228, 1)
assert(type(prediction) == type(array([]))) # Prediction should be np.array
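# Each row of 'prediction' is one 10-step forecast for the window ending at
# that sample, so the most recent forecast is simply the last row:
print("Latest 10-step forecast (normalized):", prediction[-1])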
# -------------------------------------------------------------------- I don't think tail and prediction NEED to be same shape...?
tail = df.iloc[:, :-6].tail(len(prediction)) # keeps only the first column ("Open")
tail = tail.to_numpy()
"""print("TAIL:\n", tail.shape)
print("PREDICTION:\n", prediction.shape)
assert(tail.shape == prediction.shape)
#nrm_scale_df = array(tail).reshape(-1, 1)"""
# Refit the scaler on the single "Open" column so it can denormalize one-feature data
set_new_scaled = min_max_scaler.fit_transform(csvfile.iloc[:, [1]])
# Denormalize
prediction = min_max_scaler.inverse_transform(prediction)
historic = min_max_scaler.inverse_transform(tail)
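# NB (assumption): calling inverse_transform on a (n_samples, 10) array with a
# scaler fitted on one column relies on NumPy broadcasting; newer sklearn
# versions validate feature counts and may raise here. A manual equivalent:
# prediction = (prediction - min_max_scaler.min_) / min_max_scaler.scale_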
print("historic", historic)
with np.printoptions(threshold=np.inf):
print("prediction", prediction)
# This percentage comparison isn't very relevant when we're predicting more days
# ahead. View the eventual graph instead.
"""compare_df = pd.DataFrame({
'Prediction': prediction[0],
'Historic': historic[0]
})
print(compare_df, "\nSimilarity:",round(*(prediction[0]/historic[0])*100,2),"%")"""
#print("\nPrediction:")
#print(prediction)
print("\nShapes match:", prediction.flatten().shape==historic.flatten().shape)
#print(prediction.reshape(len(prediction)).shape)
print("Prediction shape:", prediction.flatten().shape)
print("Historic data shape:", historic.flatten().shape)
# Plot
print("___Short view___")
print("Historic:", historic.shape)
print("Prediction:", prediction.shape)
print("Historic:", historic)
print("Prediction:", prediction)
# print(historic.describe())
# Data to be plotted
plt.plot(historic, label='Historic close')
plt.plot(prediction, label='Predicted close')
plt.draw()
plt.title("$MSFT prediction")
plt.xlabel("Date")
plt.ylabel("Price ($)")
plt.legend()
plt.draw()
if prediction.flatten().shape == historic.flatten().shape:
    pr, _ = pearsonr(historic.flatten(), prediction.flatten())
    print("Pearson R:", round(pr, 2))
print("")
# print(compare_df.describe())