我刚刚阅读了 `train_test_split` 的文档,并意识到:由于我使用了基于历史记录的函数 `create_dataHistory()`,在当前的预处理设置下,我无意中泄漏了数据——这是我在打印当前图表时发现的。我看到的总共是 24 + 6 = 30 个时间戳(time_stamps),而不是 32 + 8 = 40 个!在这种划分方式下,基于历史的预测方法是否合理?如果不合理,我该如何解决?
# Select the `index` subset of data_train first: the next line rebinds
# data_train to its `.values`, so this lookup has to happen before that.
# presumably data_train is a pandas DataFrame here and `index` is a set of
# column labels — TODO confirm; neither is defined in this snippet.
Y_train= data_train[index]
data_train = data_train.values
# NOTE(review): the label reads "data_train size" but the shape printed is
# Y_train's, not data_train's.
print("data_train size: {}".format(Y_train.shape))
data_train size: (40, 960)
from sklearn.model_selection import train_test_split
def create_dataHistory(dataset, data_train, look_back=1):
    """Build sliding-window inputs and their next-step targets.

    Args:
        dataset: 2-D array indexed as ``[row, column]``; each input sample is
            a block of ``look_back`` consecutive rows taken from it.
        data_train: 2-D array of targets indexed like ``dataset``; the target
            paired with the window starting at row ``i`` is row
            ``i + look_back``.
        look_back: Number of consecutive rows in every input window.

    Returns:
        A ``(dataX, dataY)`` pair of NumPy arrays with one entry per window
        start, i.e. ``len(dataset) - look_back - 1`` entries when that value
        is positive and none otherwise.
    """
    # NOTE: because of the trailing "- 1" the final row is never used as a
    # target. It is kept as-is so the sample count (and the printed "Len:")
    # does not change for existing callers.
    n_samples = len(dataset) - look_back - 1
    print("Len:", n_samples)
    # Window i covers rows [i, i + look_back); its target is the row right
    # after the window.
    dataX = [dataset[i:i + look_back, :] for i in range(n_samples)]
    dataY = [data_train[i + look_back, :] for i in range(n_samples)]
    return np.array(dataX), np.array(dataY)
look_back = 10
trainX, trainY = create_dataHistory(data_train, Y_train, look_back=look_back)
# Consecutive windows share rows, so the default shuffled split scatters
# overlapping windows across train and test and leaks test-period data into
# training. shuffle=False keeps the original order: the first 80% of the
# windows become the training set and the last 20% the test set.
trainX, testX, trainY, testY = train_test_split(
    trainX, trainY, test_size=0.2, shuffle=False
)
print("train size: {}".format(trainX.shape))
print("train Label size: {}".format(trainY.shape))
print("test size: {}".format(testX.shape))
print("test Label size: {}".format(testY.shape))
Len: 29
train size: (23, 10, 1440)
train Label size: (23, 960)
test size: (6, 10, 1440)
test Label size: (6, 960)