What does it mean when training and validation loss diverge from epoch 1?

Asked: 2017-06-18 15:56:59

Tags: validation machine-learning deep-learning keras

I have recently been working on a deep learning model in Keras, and it has left me quite confused. The model is able to learn the training data over time, but it consistently gets worse results on the validation data.

[Figures: model accuracy over epochs; model loss over epochs]

I know that if the validation accuracy goes up for a while and then starts to decrease, you are overfitting the training data, but in this case the validation accuracy only ever decreases. I am really confused about why this happens. Does anyone have any intuition about what could cause it, or any suggestions for things to test that might fix it?

Edit to add more information and code

Okay, so I am building a model that tries to do some basic stock prediction. Looking at the open, high, low, close, and volume of the past 40 days, the model tries to predict whether the price will rise by two average true ranges without dropping by one average true range. As input I took CSVs from Yahoo Finance containing that information for all the stocks in the Dow Jones Industrial Average over the past 30 years. The model trains on 70% of the stocks and validates on another 20%. This results in about 150,000 training samples. I am currently using a 1D convolutional neural network, but I have also tried other, smaller models (logistic regression and a small feed-forward NN), and I always get either the same diverging train and validation losses, or nothing learned at all because the model is too simple.

Here is the code:

import numpy as np
from sklearn import preprocessing
from sklearn.metrics import auc, roc_curve, roc_auc_score
from keras.layers import Input, Dense, Flatten, Conv1D, Activation, MaxPooling1D, Dropout, Concatenate
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import backend as K
import matplotlib.pyplot as plt
from random import seed, shuffle
from os import listdir

class roc_auc(Callback):
    """Tracks the validation ROC AUC after each epoch and saves the weights
    whenever the AUC reaches a new best."""
    def on_train_begin(self, logs={}):
        self.aucs = []

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred = self.model.predict(self.validation_data[0])
        self.aucs.append(roc_auc_score(self.validation_data[1], y_pred))
        if max(self.aucs) == self.aucs[-1]:
            self.model.save_weights("weights.roc_auc.hdf5")
        print(" - auc: %0.4f" % self.aucs[-1])
        return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

rrr = 2  # reward:risk ratio -- take-profit at 2 ATR, stoploss at 1 ATR
epochs = 200
batch_size = 64
days_input = 40
seed(42)
X_train = []
X_test = []
y_train = []
y_test = []

files = listdir("Stocks")
total_stocks = len(files)
shuffle(files)

for x, file in enumerate(files):
    test = False
    if (x+1.0)/total_stocks > 0.7:
        test = True
    if test:
        print("Test -> Stocks/%s" % file)
    else:
        print("Train -> Stocks/%s" % file)
    stock = np.loadtxt(open("Stocks/"+file, "r"), delimiter=",", skiprows=1, usecols = (1,2,3,5,6))
    #stock indexes -> 0:open, 1:high, 2:low, 3:close, 4:Volume, 5:ATR(generated)
    atr = []
    last = None

    # Wilder-smoothed average true range: ATR = (13 * prev_ATR + TR) / 14
    for day in stock:
        if last is None:
            tr = abs(day[1] - day[2])
            atr.append(tr)
        else:
            tr = max(day[1] - day[2], abs(last[3] - day[1]), abs(last[3] - day[2]))
            atr.append((13*atr[-1]+tr)/14)
        last = day.copy()

    stock = np.insert(stock, 5, atr, axis=1)
    #print(stock)
    for i in range(days_input,stock.shape[0]-1):   
        input = stock[i-days_input:i, 0:5].copy()
        for j, day in enumerate(input):
            input[j][1] = (day[1]-day[0])/day[0]
            input[j][2] = (day[2]-day[0])/day[0]
            input[j][3] = (day[3]-day[0])/day[0]
        input[:,0] = input[:,0] / np.linalg.norm(input[:,0])
        input[:,1] = input[:,1] / np.linalg.norm(input[:,1])
        input[:,2] = input[:,2] / np.linalg.norm(input[:,2])
        input[:,3] = input[:,3] / np.linalg.norm(input[:,3])
        input[:,4] = input[:,4] / np.linalg.norm(input[:,4])
        preprocessing.scale(input, copy=False)
        # label: 1 if the price reaches entry + rrr*ATR before dropping below
        # the stoploss at entry - 1*ATR; 0 if the stoploss is hit first;
        # -1 if neither happens before the data runs out
        output = -1
        buy = stock[i][1]
        stoploss = buy - stock[i][5]
        target = buy + rrr*stock[i][5]

        for j in range(i+1, stock.shape[0]):
            if stock[j][0] < stoploss or stock[j][2] < stoploss:
                output = 0
                break
            elif stock[j][1] > target:
                output = 1
                break

        if output != -1:
            if test:
                X_test.append(input)
                y_test.append(output)
            else:
                #if((output == 0 and np.average(y_train) > 0.5) or output == 1):
                X_train.append(input)
                y_train.append(output)

shape = list(X_train[0].shape)
shape[:0] = [len(X_train)]
X_train = np.concatenate(X_train).reshape(shape)
y_train = np.array(y_train)

#print("Training on %d samples" % X_train.shape[0])

shape = list(X_test[0].shape)
shape[:0] = [len(X_test)]
X_test = np.concatenate(X_test).reshape(shape)
y_test = np.array(y_test)

#print("Test on %d samples" % X_test.shape[0])

print("Train class split is %0.2f" % (100*np.average(y_train)))
print("Test class split is %0.2f" % (100*np.average(y_test)))

inputs = Input(shape=(days_input,5))

x = Conv1D(32, 5, padding='same')(inputs)
x = Activation('relu')(x)
x = MaxPooling1D()(x)
#x = Dropout(0.25)(x)

x = Conv1D(64, 5, padding='same')(x)
x = Activation('relu')(x)
x = MaxPooling1D()(x)
#x = Dropout(0.25)(x)

x = Conv1D(128, 5, padding='same')(x)
x = Activation('relu')(x)
x = MaxPooling1D()(x)
#x = Dropout(0.25)(x)

#x = Conv1D(128, 3, padding='same')(x)
#x = Activation('relu')(x)
#x = MaxPooling1D()(x)
#x = Dropout(0.25)(x)

#x = Concatenate()([x1,x2,x3,x4])

x = Flatten()(x)
#x = Dense(64, activation="relu")(x)
#x = Dropout(0.25)(x)
x = Dense(128, activation="relu")(x)
x = Dense(64, activation="relu")(x)
#x = Dropout(0.5)(x)
output = Dense(1, activation="sigmoid")(x)

model = Model(inputs=inputs,outputs=output)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

filepath="weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0, save_best_only=True, mode='max')
auc_hist = roc_auc()
callbacks_list = [checkpoint, auc_hist]

# Keras expects class_weight to be a dict mapping class index to weight; the
# sklearn-style string 'balanced' is not supported (older Keras silently
# ignores it), so build the equivalent dict from the class frequencies
pos_frac = np.average(y_train)
class_weights = {0: 0.5 / (1.0 - pos_frac), 1: 0.5 / pos_frac}

history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=epochs, callbacks=callbacks_list,
                    batch_size=batch_size, class_weight=class_weights).history

model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("weights.latest.hdf5")
model.load_weights("weights.roc_auc.hdf5")

plt.plot(history['acc'])
plt.plot(history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
plt.plot(history['loss'])
plt.plot(history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

plt.plot(auc_hist.aucs)
plt.title('model ROC AUC')
plt.ylabel('AUC')
plt.xlabel('epoch')
plt.show()

y_pred = model.predict(X_train)

fpr, tpr, _ = roc_curve(y_train, y_pred)
roc_auc = auc(fpr, tpr)

plt.subplot(1, 2, 1)
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy',linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Train ROC')
plt.legend(loc="lower right")

y_pred = model.predict(X_test)

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

plt.subplot(1, 2, 2)
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy',linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Test ROC')
plt.legend(loc="lower right")
plt.show()

#threshold that optimizes tp*winPercent/(fp*(1-winPercent)+tp*winPercent) on validation
#bestThresh = 0
#bestVal = 0
#for i in range(len(thresholds)):
#   value = 100*tpr[i]*winPercent/(fpr[i]*(1-winPercent)+tpr[i]*winPercent)
#   if value >= bestVal:
#       bestVal = value
#       bestThresh = thresholds[i]
#print("Best Threshold(%0.2f) has a win rate of %0.2f%%" % (bestThresh, bestVal))

#save to csv then use roc analyzer to find best
with open('roc.csv','w+') as file:
    for i in range(len(thresholds)):
        file.write("%f,%f,%f\n" % (fpr[i], tpr[i], thresholds[i]))

Results per 100 batches instead of per epoch

I took the suggestions and made some updates. The classes are now balanced 50/50 instead of 25/75. Also, the validation data is now selected randomly instead of being a specific set of stocks. By plotting the loss and accuracy at a finer resolution (100 batches vs. 1 epoch), the overfitting is clearly visible. The model actually starts out learning before it begins to diverge. I am surprised at how quickly it starts to overfit, but now that I can see the problem, hopefully I can debug it.

[Figures: accuracy per 100 batches; loss per 100 batches]
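
For reference, curves at that resolution can be produced with a small Keras callback along these lines (a sketch of the idea, not the exact code used for the plots above):

from keras.callbacks import Callback

class BatchHistory(Callback):
    """Records training loss and accuracy every 100 batches instead of per epoch."""
    def on_train_begin(self, logs=None):
        self.losses = []
        self.accs = []
        self.batches_seen = 0

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        self.batches_seen += 1
        if self.batches_seen % 100 == 0:
            self.losses.append(logs.get('loss'))
            self.accs.append(logs.get('acc'))

batch_hist = BatchHistory()  # then pass callbacks=[batch_hist] to model.fit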

2 Answers:

Answer 0 (score: 4)

Possible explanations:

  1. Coding error
  2. Overfitting due to differences between the training and validation data
  3. Skewed classes (together with differences between the training and validation data)

Things I would try:

    1. Swap the training and validation sets. Does the problem still occur?
    2. Plot the curves in more detail for the first ~10 epochs (e.g. directly after initialization, and after every few training iterations, not only per epoch). Do you still start above 75%? If so, your classes might be skewed, and you might also want to check whether your training/validation split is stratified.
    3. Code:

      1. This is useless: np.concatenate(X_train)
      2. Make your code as readable as possible when you post it here. That includes removing lines that are commented out.
      3. This looks suspicious of a coding error to me:

        if test:
            X_test.append(input)
            y_test.append(output)
        else:
            #if((output == 0 and np.average(y_train) > 0.5) or output == 1):
            X_train.append(input)
            y_train.append(output)
        

        Use sklearn.model_selection.train_test_split instead. Do all transformations to the data beforehand, then make the split with this method.
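
        A rough sketch of what that could look like (the names X_all and y_all are assumed stand-ins for the complete arrays built in the preprocessing loop; they are not from the question):

        from sklearn.model_selection import train_test_split

        # stratify keeps the class ratio identical in the train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X_all, y_all, test_size=0.3, stratify=y_all, random_state=42)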

Answer 1 (score: 0)

The batch size seems too small for the number of training samples you have. Try batching 20% of the samples and see what difference it makes.
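
For example (a rough sketch using the arrays from the question; 20% of ~150,000 samples is a very large batch, so treat this only as an experiment to see how the curves change):

batch_size = int(0.2 * X_train.shape[0])  # roughly 20% of the training samples
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=epochs, batch_size=batch_size).history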