I'm using TensorFlow with Keras and I'm trying to build a simple two-layer RNN with LSTM cells (for language modeling, i.e. predicting the next word). I'm using the PTB dataset and trying to implement the network from "Recurrent Neural Network Regularization" (Embedding -> LSTM1 -> LSTM2 -> Dense). Unfortunately I get strange results: the first epoch looks great, the loss drops from ~9 to ~6.9 (categorical cross-entropy), but then it stays stuck there forever (even if I lower the learning rate). As a test, I expected to be able to overfit a small text with a similar architecture, but I get the same result! I'm using a batch size of 20 with 20 time steps per example. I'm 100% sure that my batchGenerator function...
Is it even possible to overfit? What am I missing?
My code:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense, Activation

tempSet = batchGenerator(text=testSet[:10000], batchSize=batchSize,
                         timeStepsPerBatch=timeStepsPerBatch, vocabSize=vocabSize)

model = Sequential()
model.add(Embedding(vocabSize, 200, input_length=20, batch_input_shape=(20, 20)))
model.add(LSTM(200, return_sequences=True, stateful=True))
model.add(LSTM(200, return_sequences=True, stateful=True))
model.add(TimeDistributed(Dense(vocabSize)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['categorical_accuracy'])

# TODO: add reset callback after each epoch, add perplexity metric
modelHistory = model.fit_generator(tempSet.generate(),
                                   steps_per_epoch=tempSet.stepPerEpoch, epochs=100, verbose=1,
                                   callbacks=[resetStateAtEpochEndCall],
                                   validation_data=None, validation_steps=None)
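The resetStateAtEpochEndCall callback isn't included above; a minimal sketch of such a reset callback (assuming all it does is clear the stateful LSTM states at the end of each epoch) would be:

import tensorflow as tf

# Minimal sketch (assumption): reset the stateful LSTM hidden/cell states
# at the end of every epoch so the next epoch starts from a clean state.
class ResetStatesCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        self.model.reset_states()

resetStateAtEpochEndCall = ResetStatesCallback()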
Results:

Epoch 1/100
24/24 [==============================] - 11s 471ms/step - loss: 8.1857 - categorical_accuracy: 0.0422
Epoch 2/100
24/24 [==============================] - 4s 156ms/step - loss: 6.2512 - categorical_accuracy: 0.0536
Epoch 3/100
24/24 [==============================] - 4s 155ms/step - loss: 6.1297 - categorical_accuracy: 0.0504
Epoch 4/100
24/24 [==============================] - 4s 154ms/step - loss: 6.0974 - categorical_accuracy: 0.0497
Epoch 5/100
24/24 [==============================] - 4s 153ms/step - loss: 6.0877 - categorical_accuracy: 0.0497
Epoch 6/100
24/24 [==============================] - 4s 153ms/step - loss: 6.0828 - categorical_accuracy: 0.0503
Epoch 7/100
24/24 [==============================] - 4s 153ms/step - loss: 6.0800 - categorical_accuracy: 0.0505
Epoch 8/100
24/24 [==============================] - 4s 152ms/step - loss: 6.0781 - categorical_accuracy: 0.0505
Epoch 9/100
24/24 [==============================] - 4s 152ms/step - loss: 6.0766 - categorical_accuracy: 0.0506
Epoch 10/100
24/24 [==============================] - 4s 153ms/step - loss: 6.0755 - categorical_accuracy: 0.0506
...
24/24 [==============================] - 4s 149ms/step - loss: 6.0477 - categorical_accuracy: 0.0504
Epoch 98/100
24/24 [==============================] - 4s 150ms/step - loss: 6.0470 - categorical_accuracy: 0.0501
Epoch 99/100
24/24 [==============================] - 4s 150ms/step - loss: 6.0483 - categorical_accuracy: 0.0498
Epoch 100/100
24/24 [==============================] - 4s 149ms/step - loss: 6.0471 - categorical_accuracy: 0.0498
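Side note on the numbers (my own rough check, assuming the standard 10,000-word PTB vocabulary): perplexity is just exp of the per-word cross-entropy, so the initial loss is about what a uniform prediction would give, and the plateau around 6.05 corresponds to a perplexity of roughly 424.

import numpy as np

# Rough check (assumption: 10,000-word vocabulary): perplexity = exp(loss)
print(np.log(10000))  # ~9.21 -> loss of a uniform (untrained) next-word prediction
print(np.exp(6.05))   # ~424  -> perplexity at the plateau around loss 6.05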
class batchGenerator:
    def __init__(self, text, batchSize=20, timeStepsPerBatch=35, vocabSize=9999):
        self.text = text
        self.textLen = len(text)
        self.batchSize = batchSize
        self.segLen = self.textLen // self.batchSize  # we'll divide the input text into segments,
        # each segment will be used to generate a sample inside a mini-batch
        # this way we can use the last hidden state of the RNN as an input to the next mini-batch
        self.cursor = [ind * self.segLen for ind in range(self.batchSize)]
        self.timeStepsPerBatch = timeStepsPerBatch
        self.vocabSize = vocabSize
        self.stepPerEpoch = (self.segLen - 1) // self.timeStepsPerBatch
        print('Number of steps per epoch', self.stepPerEpoch)
        self.iterInd = 0
        # alert for data lost when dividing the text
        lostDataOutOfSeg = self.textLen - self.segLen * self.batchSize
        if lostDataOutOfSeg > 0:
            print('Number of words lost because the text didn\'t divide evenly into %d segments: %d, which is %.2f%% of the total data'
                  % (self.batchSize, lostDataOutOfSeg, (lostDataOutOfSeg / self.textLen) * 100))
        lostDataInSeg = (self.segLen - 1 - ((self.segLen - 1) // self.timeStepsPerBatch) * self.timeStepsPerBatch) * self.batchSize
        if lostDataInSeg > 0:
            print('Number of words lost because a segment didn\'t divide evenly into %d time steps: %d, which is %.2f%% of the total data'
                  % (self.timeStepsPerBatch, lostDataInSeg, (lostDataInSeg / self.textLen) * 100))
        if lostDataOutOfSeg + lostDataInSeg > 0:
            print('Total lost data: %d, which is %.2f%% of the total data'
                  % ((lostDataOutOfSeg + lostDataInSeg), (lostDataOutOfSeg + lostDataInSeg) * 100 / self.textLen))

    def generate(self):
        # outputs a mini-batch of data (x, y)
        # x - tensor of length batchSize. Each entry in x is a (partial) sentence (timeStepsPerBatch words) from a segment
        # y - the same as x, except shifted by 1 word (the targets)
        while True:  # generator
            self.iterInd = self.iterInd + 1
            # print(self.iterInd)
            x = np.zeros((self.batchSize, self.timeStepsPerBatch), dtype=np.int32)
            y = np.zeros((self.batchSize, self.timeStepsPerBatch, self.vocabSize), dtype=np.int32)
            # check if we can take a full timeStepsPerBatch from each segment
            if self.cursor[0] + self.timeStepsPerBatch + 1 > self.segLen:  # TODO: double check condition
                # end of an epoch, reset cursors
                # print('End of epoch, cursor reset. iter num ', self.iterInd)
                self.cursor = [ind * self.segLen for ind in range(self.batchSize)]
            for i in range(self.batchSize):
                x[i, :] = self.text[self.cursor[i]:(self.cursor[i] + self.timeStepsPerBatch)]
                y_id = self.text[(self.cursor[i] + 1):(self.cursor[i] + self.timeStepsPerBatch + 1)]
                y[i, :, :] = tf.keras.utils.to_categorical(y_id, num_classes=self.vocabSize)  # transform to 1-hot encoding
                # update cursor
                self.cursor[i] = self.cursor[i] + self.timeStepsPerBatch
            yield x, y
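For completeness, this is the kind of quick sanity check I mean for the generator (hypothetical toy data, not my real PTB run), just to show the shapes it yields:

import numpy as np

# Toy data: 1000 fake word ids drawn from a vocabulary of 50 (assumption, for illustration only)
toyText = list(np.random.randint(0, 50, size=1000))
gen = batchGenerator(text=toyText, batchSize=4, timeStepsPerBatch=5, vocabSize=50)
x, y = next(gen.generate())
print(x.shape)  # (4, 5)      -> (batchSize, timeStepsPerBatch)
print(y.shape)  # (4, 5, 50)  -> (batchSize, timeStepsPerBatch, vocabSize)
# y[i, t] is the one-hot encoding of the word that follows x[i, t] in the text
assert np.argmax(y[0, 0]) == x[0, 1]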