我正在尝试在PennTree Bank上训练基本的单向LSTM RNN语言模型。我的神经网络运行,但测试集的损失根本没有减少。我想知道为什么会这样?
网络参数:
V = 10000
batch_size = 20
hidden_size = 650
embed_size = hidden_size
num_unrollings = 35
max_epoch = 6
learning_rate = 1.0
图表定义:
graph = tf.Graph()
with graph.as_default():
cell_state = tf.placeholder(tf.float32, shape=(batch_size, hidden_size), name="CellState")
hidden_state = tf.placeholder(tf.float32, shape=(batch_size, hidden_size), name="HiddenState")
curr_batch = tf.placeholder(tf.int32, shape=[num_unrollings + 1, batch_size])
lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size)
embeddings = tf.Variable(tf.truncated_normal([V, embed_size], -0.1, 0.1), trainable=True, dtype=tf.float32)
W = tf.Variable(tf.truncated_normal([hidden_size, V], -0.1, 0.1))
b = tf.Variable(tf.zeros(V))
inputs = curr_batch[:num_unrollings,:] # num_unrollings x batch_size
labels = curr_batch[1:, :] # num_unrollings x batch_size
input_list = list()
for t in range(num_unrollings):
emb = tf.nn.embedding_lookup(embeddings, inputs[t,:])
input_list.append(emb)
outputs, states = tf.nn.static_rnn(lstm, input_list, initial_state=[cell_state, hidden_state]) # outputs: num_unrollings x batch_size x hidden
cell_state, hidden_state = states
outputs_flat = tf.reshape(outputs, [-1, lstm.output_size]) # output_flat: (num_unrollings x batch_size) x hidden
logits = tf.nn.softmax(tf.matmul(outputs_flat, W) + b) # logits_tensor: (num_unrollings x batch_size) x V
logits_tensor = tf.reshape(logits, [batch_size, num_unrollings, V])
targets = tf.transpose(labels) # targets: batch_size x num_unrollings
weights = tf.ones([batch_size, num_unrollings]) # weights: batch_size x num_unrollings
loss = tf.reduce_sum(tf.contrib.seq2seq.sequence_loss(logits_tensor, targets, weights, average_across_timesteps=False, average_across_batch=True))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
会话:
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
cstate = np.zeros([batch_size, hidden_size]).astype(np.float32)
hstate = np.zeros([batch_size, hidden_size]).astype(np.float32)
for epoch in range(max_epoch):
CURSOR_train = 0
epoch_over = False
steps = 0
average_loss = 0.0
while not epoch_over:
new_batch, epoch_over = nextBatch()
feed_data = {curr_batch: new_batch, "CellState:0": cstate, "HiddenState:0": hstate}
_, l, new_cell_state, new_hidden_state = session.run([optimizer, loss, cell_state, hidden_state], feed_dict=feed_data)
cstate = new_cell_state
hstate = new_hidden_state
average_loss += l
PRINT_INTERVAL = 200
if steps % PRINT_INTERVAL == 0:
print("Avg loss for last {0} batches: {1}".format(PRINT_INTERVAL, average_loss / PRINT_INTERVAL))
average_loss = 0
TEST_INTERVAL = 600
if steps % TEST_INTERVAL == 0:
# Evaluate the model
test_over = False
test_loss = 0.0
test_batch_num = 0
print("Testing ... ")
while not test_over:
test_batch_num += 1
test_batch, test_over = nextBatch(setup='test')
feed_data_test = { curr_batch: test_batch, "CellState:0": cstate, "HiddenState:0": hstate }
tl, d1, d2 = session.run([loss, cell_state, hidden_state], feed_dict=feed_data_test)
test_loss += tl
test_loss = test_loss / test_batch_num
print("Avg loss on test set: {0}".format(test_loss))
steps += 1
sys.stdout.write('\rStep: {0}'.format(steps))
测试组的损失总是320.2430792614422,无论我训练多久。训练集的损失确实发生了变化。 谢谢你!
答案 0 :(得分:0)
您的学习率过高,请尝试0.0005
的学习率并调整此数字。