I am trying to generate sequences to feed into an LSTM in Keras.
The sample data and the code for the data generator are shown below (note that the print statements inside the generator are only there to help with debugging from a Jupyter notebook).
The snippets below can be copy/pasted into Jupyter to reproduce the error.
First, the code that generates the test data:
#Generating test data to check the data generator.
import pandas as pd
import numpy as np
a=np.arange(1,50)
b=np.arange(1,50)
c=np.arange(1,50)
y=np.random.randint(2, size=49)
data={'a':a,'b':b,'c':c,'y':y}
data=pd.DataFrame.from_dict(data)
colnames=list(data)[0:data.shape[1]-1]
col_y='y'
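A quick check of what the toy frame looks like (output in the comments):
print(data.shape)   # (49, 4) -- 49 rows of a, b, c plus the label column y
print(colnames)     # ['a', 'b', 'c']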
The code for the generator itself follows. I added the print statements to help with debugging.
import numpy as np
import keras
class DataGenerator(keras.utils.Sequence):
    def __init__(self, data_x, data_y, batch_size, lead, n_timesteps, n_features,  # dimensions are (samples, timesteps, features)
                 n_classes, shuffle=False):
        'Initialization'
        self.dim = n_features
        self.batch_size = batch_size
        self.lead = lead
        self.n_classes = n_classes
        self.labels = data_y
        self.predictor = data_x
        self.start_idx = 0
        self.end_idx = len(data_x)
        self.n_timesteps = n_timesteps
        self.n_features = n_features
        self.shuffle = shuffle
        self.n_batches = 0
        self.batch_num = 1
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(len(self.labels) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        idx = self.indexes[self.batch_num]
        print("batch is", self.batch_num)
        print("idx is", idx)
        indexes = np.arange(idx - self.n_timesteps, idx + self.batch_size, step=1)
        print("indexes are", indexes)
        x_temp = self.predictor[indexes, ]
        y_temp = self.labels[indexes]
        # Generate data
        X, y = self.__data_generation(x_temp, y_temp)
        self.batch_num += 1
        print("X is", X)
        print("y is", y)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.batch_num = 1
        self.indexes = np.arange(self.start_idx, self.end_idx - self.batch_size, step=self.batch_size)
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, x_temp, y_temp):
        'Generates data containing batch_size samples'  # X : (n_samples, n_timesteps, n_features)
        # Initialization
        X = np.empty([self.batch_size, self.n_timesteps, self.n_features])
        y = np.empty((self.batch_size), dtype=int)
        # Generate data
        for rows in range(self.batch_size):
            # Store sample
            X[rows] = np.expand_dims(x_temp[rows:rows + self.n_timesteps, :][::-1], axis=0)
            # Store class
            y[rows] = y_temp[rows + self.n_timesteps]
        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)
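As a quick sanity check (just a sketch on the toy data above, not part of the real pipeline), pulling a single batch by hand gives the shapes I expect:
x_all = data[colnames].values   # (49, 3)
y_all = data[col_y].values      # (49,)
gen_check = DataGenerator(x_all, y_all, batch_size=10, lead=0,
                          n_timesteps=10, n_features=3, n_classes=2)
X0, y0 = gen_check[0]
print(X0.shape, y0.shape)       # (10, 10, 3) (10, 2)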
Finally, I use the code below to set up the training and validation data.
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM
from keras.regularizers import l1, l2
#from my_classes import DataGenerator
n_timesteps=10
n_features=3
# Parameters
params_train = {
    'batch_size': 10,
    'n_classes': 2,
    'n_timesteps': n_timesteps,
    'n_features': n_features,
    'lead': 0,
    'shuffle': False,
}
params_val = {
    'batch_size': 10,
    'n_classes': 2,
    'n_timesteps': n_timesteps,
    'n_features': n_features,
    'lead': 0,
    'shuffle': False,
}
# Datasets
train_x=data.iloc[0:100,0:data.shape[1]-1].values
train_y=data.iloc[0:100,data.shape[1]-1].values
val_x=data.iloc[101:149,0:data.shape[1]-1].values
val_y=data.iloc[101:149,data.shape[1]-1].values
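For reference, the shapes that come out of the slicing above (the toy frame only has 49 rows, so iloc[0:100] returns all of them and iloc[101:149] comes back empty):
print(train_x.shape, train_y.shape)   # (49, 3) (49,)
print(val_x.shape, val_y.shape)       # (0, 3) (0,)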
And last, a sample network...
# Generators
training_generator = DataGenerator(train_x,train_y, **params_train)
validation_generator = DataGenerator(val_x,val_y, **params_val)
# Design model
model = Sequential()
model.add(LSTM(n_features,return_sequences=True,input_shape = (n_timesteps,n_features)))
model.add(Dropout(0.4))
model.add(LSTM(n_features,input_shape = (n_timesteps,n_features)))
model.add(Dense(n_features,activation='relu'))
model.add(Dense(2))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['categorical_accuracy'])
# Train model on dataset
model.fit_generator(generator=training_generator,
                    use_multiprocessing=False,
                    workers=1, epochs=1)
I limited it to a single worker with no multiprocessing so that I could debug easily; feel free to change that...
With this setup, the error I get is:
StopIteration: index 4 is out of bounds for axis 0 with size 4
It seems to work fine until it gets to the third batch, which generates sequences 39-30. After that it errors out and never gets to the 49-40 series of data.
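For what it's worth, the same out-of-bounds index error shows up without fit_generator if I just step through the number of batches the generator reports (a minimal sketch on the training split above):
gen = DataGenerator(train_x, train_y, **params_train)
print(len(gen))      # 5, i.e. ceil(49 / 10)
print(gen.indexes)   # [ 0 10 20 30] -- only 4 start positions
for i in range(len(gen)):
    X, y = gen[i]    # the 4th call fails with: index 4 is out of bounds for axis 0 with size 4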
Any thoughts on what is causing this? Thanks!