我正在尝试在TensorFlow中构建一个单词级(word-level)语言模型。我的输入是形状为(batch_size, seq_length)的单词id批次,
目标(target)则是把输入向左移动一步得到的序列(也就是说,对于每个单词,目标是序列中的下一个单词)。
模型以词嵌入作为输入(词嵌入使用gensim word2vec预先训练得到)。我手动检查过,词嵌入已被正确读取,并且与正确的单词id一一对应。
虽然我尝试了很多办法,但我的模型始终没有改进。即使在完整训练集上训练100个epoch,准确率仍然保持不变。
我尝试过的方法(均未成功):
一开始,损失在下降,准确率在提高,模型也在不断调整其预测。但是,在完整训练集上训练若干个epoch之后,损失和准确率就保持不变了,模型的预测也不再发生变化,陷入停滞。下面是一个示例,显示了同一输入序列上损失和准确率的变化过程。在第30个epoch之后,就不再有任何变化了:
2017-11-08 06:59:24,298 - DEBUG - Targets: [ 91 4 9 116 237 1953 240 3 2 1 0 2 1 9 144 351 29 299 24 453]
2017-11-08 06:59:24,299 - DEBUG - Predicted sequence: [0 0 0 0 0 0 0 0 2 1 0 0 1 0 0 0 0 0 0 0]
2017-11-08 06:59:24,299 - INFO - Current epoch: 1
2017-11-08 06:59:24,299 - INFO - Current training step: 2000
2017-11-08 06:59:24,299 - INFO - Current loss: 107.67147064208984
2017-11-08 06:59:24,299 - INFO - Current accuracy: 0.1599999964237213
2017-11-08 07:04:09,559 - DEBUG - Targets: [ 91 4 9 116 237 1953 240 3 2 1 0 2 1 9 144 351 29 299 24 453]
2017-11-08 07:04:09,560 - DEBUG - Predicted sequence: [ 4 4 6 6 16 0 0 3 2 1 9 2 1 0 0 4 0 0 4 8]
2017-11-08 07:04:09,560 - INFO - Current epoch: 5
2017-11-08 07:04:09,560 - INFO - Current training step: 2000
2017-11-08 07:04:09,560 - INFO - Current loss: 97.8116455078125
2017-11-08 07:04:09,560 - INFO - Current accuracy: 0.2150000035762787
2017-11-08 07:43:03,875 - DEBUG - Targets: [ 91 4 9 116 237 1953 240 3 2 1 0 2 1 9 144 351 29 299 24 453]
2017-11-08 07:43:03,875 - DEBUG - Predicted sequence: [ 6 4 9 55 47 0 5 3 2 1 9 2 1 0 55 24 0 0 3 6]
2017-11-08 07:43:03,876 - INFO - Current epoch: 30
2017-11-08 07:43:03,876 - INFO - Current training step: 2000
2017-11-08 07:43:03,876 - INFO - Current loss: 84.75357055664062
2017-11-08 07:43:03,876 - INFO - Current accuracy: 0.2549999952316284
我已经在这个问题上花了一个星期,不知道还能再尝试什么。非常感谢任何提示或想法。
代码的重要部分在这里:
def build_graph(self, graph):
    """Build the LSTM language-model graph inside ``graph`` and return it.

    The graph looks up frozen embeddings for a batch of word ids, runs them
    through a stack of ``self.n_layers`` LSTM cells, projects every time step
    onto the vocabulary and trains with a gradient-clipped Adam optimizer.

    Attributes set on ``self``: ``input_batch``, ``inputs``, ``label_batch``,
    ``init_state``, ``final_state``, ``output_flat``, ``logits``, ``loss``,
    ``predictions``, ``perplexity``, ``accuracy``, ``global_step``,
    ``optimizer`` and ``train_step``.

    Args:
        graph: The ``tf.Graph`` all ops and variables are added to.

    Returns:
        The same ``graph`` object that was passed in.
    """
    with graph.as_default():
        tf.set_random_seed(self.random_seed)

        with tf.variable_scope('embedding'):
            # The embedding matrix is initialised from self.embds and frozen
            # (trainable=False), so it receives no gradient updates.
            embedding_matrix = tf.get_variable(
                name='embedding_matrix',
                shape=self.embds.shape,
                initializer=tf.constant_initializer(self.embds),
                trainable=False)

        with tf.name_scope('input'):
            self.input_batch = tf.placeholder(
                tf.int64, shape=(None, self.seq_length))
            self.inputs = tf.nn.embedding_lookup(
                embedding_matrix, self.input_batch)
            self.label_batch = tf.placeholder(
                tf.int64, shape=(None, self.seq_length))

        with tf.name_scope('rnn'):
            # One LSTM cell per layer. Dropout is currently not applied.
            cells = [
                tf.contrib.rnn.LSTMCell(
                    self.n_hidden,
                    initializer=tf.contrib.layers.xavier_initializer())
                for _ in range(self.n_layers)
            ]
            cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

            # Zero-filled initial state. It is built for exactly
            # self.batch_size rows, so every fed batch must have that size
            # even though the placeholders leave the batch dimension open.
            self.init_state = cell.zero_state(self.batch_size, tf.float32)

            output, self.final_state = tf.nn.dynamic_rnn(
                cell,
                inputs=self.inputs,
                initial_state=self.init_state)

            # Project every time step onto the vocabulary: flatten the RNN
            # output to 2-D, apply one shared affine layer, then restore the
            # (batch, time, vocab) layout expected by sequence_loss.
            self.output_flat = tf.reshape(output, [-1, cell.output_size])
            softmax_w = tf.get_variable(
                "softmax_w", [self.n_hidden, self.vocab_size],
                dtype=tf.float32)
            softmax_b = tf.get_variable(
                "softmax_b", [self.vocab_size], dtype=tf.float32)
            logits = tf.nn.xw_plus_b(self.output_flat, softmax_w, softmax_b)
            self.logits = tf.reshape(
                logits, [self.batch_size, self.seq_length, self.vocab_size])

            # Cross-entropy averaged over the batch but not over time; the
            # per-time-step values are then summed, so the scalar loss grows
            # with seq_length.
            loss = tf.contrib.seq2seq.sequence_loss(
                self.logits,
                self.label_batch,
                tf.ones([self.batch_size, self.seq_length], dtype=tf.float32),
                average_across_timesteps=False,
                average_across_batch=True)
            self.loss = tf.reduce_sum(loss)

        with tf.name_scope('prediction'):
            # Most likely word id at every position.
            self.predictions = tf.argmax(self.logits, axis=2)
            # Per-token cross-entropy. It is needed by the perplexity metric
            # below; leaving it undefined made graph construction fail with
            # a NameError.
            softmax_ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.label_batch, logits=self.logits)

        with tf.name_scope("metrics"):
            correct_predictions = tf.to_float(
                tf.equal(self.label_batch, self.predictions))
            # Perplexity is exp(mean cross-entropy). Taking the mean of
            # exp(cross-entropy) instead over-weights the worst tokens.
            self.perplexity = tf.exp(tf.reduce_mean(softmax_ce))
            self.accuracy = tf.reduce_mean(correct_predictions)

        with tf.name_scope('train'):
            self.global_step = tf.Variable(
                0,
                trainable=False,
                name="global_step",
                collections=[tf.GraphKeys.GLOBAL_STEP,
                             tf.GraphKeys.GLOBAL_VARIABLES])

            # Gradients w.r.t. every trainable variable, rescaled so their
            # global L2 norm does not exceed self.clip_norm.
            parameters = tf.trainable_variables()
            gradients = tf.gradients(self.loss, parameters)
            clipped_gradients, _ = tf.clip_by_global_norm(
                gradients, self.clip_norm)

            # NOTE(review): epsilon=0.1 is far above Adam's default and
            # damps the adaptive step size; confirm this is intentional.
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.lr, epsilon=0.1)
            self.train_step = self.optimizer.apply_gradients(
                zip(clipped_gradients, parameters),
                global_step=self.global_step)

        self._create_summaries()
    return graph
def train(self, save_every=20):
    """Run the training loop over ``self.x_train`` / ``self.y_train``.

    For each of ``self.n_epochs`` epochs the training data is consumed in
    consecutive slices of ``self.batch_size`` rows; leftover rows that do
    not fill a whole batch are skipped. The final LSTM state of each step is
    fed back as the initial state of the next step. Every 2000th batch of an
    epoch a summary is written and progress is logged.

    Args:
        save_every: Currently unused; kept for interface compatibility.
    """
    with self.graph.as_default():
        # Zero initial (c, h) pair for every layer of the stacked LSTM.
        # Deriving the layer count from self.init_state keeps this in sync
        # with the graph instead of hard-coding two layers.
        zero_state = np.zeros((self.batch_size, self.n_hidden))
        _current_state = tuple(
            (zero_state, zero_state) for _ in self.init_state)

        training_step = 0
        for epoch_id in range(0, self.n_epochs):
            self.n_batches = int(self.x_train.shape[0] // self.batch_size)

            for batch_number in range(0, self.n_batches):
                training_step += 1
                from_index = batch_number * self.batch_size
                to_index = from_index + self.batch_size
                _inputs = self.x_train[from_index:to_index, :]
                _labels = self.y_train[from_index:to_index, :]

                feed_dict = {
                    self.input_batch: _inputs,
                    self.label_batch: _labels,
                }
                # Carry the recurrent state over from the previous step.
                # NOTE(review): this only makes sense if row i of batch k+1
                # continues row i of batch k; verify the data layout.
                for layer_init, layer_state in zip(self.init_state,
                                                   _current_state):
                    feed_dict[layer_init[0]] = layer_state[0]
                    feed_dict[layer_init[1]] = layer_state[1]

                # Perplexity is fetched so the log line below has a value;
                # the raw logits are not fetched because nothing uses them.
                (_predictions, _, _current_state, _loss, _acc, _perpl,
                 summary) = self.sess.run(
                    [self.predictions,
                     self.train_step,
                     self.final_state,
                     self.loss,
                     self.accuracy,
                     self.perplexity,
                     self.merged],
                    feed_dict=feed_dict)

                if batch_number % 2000 == 0:
                    pred = _predictions[0]
                    self.sw.add_summary(summary, training_step)
                    tf.logging.debug("Targets: {}".format(_labels[0]))
                    tf.logging.debug("Predicted sequence: {}".format(pred))
                    tf.logging.info("Current epoch: {}".format(epoch_id))
                    # Reports the batch index within the epoch, not the
                    # global training_step counter.
                    tf.logging.info("Current training step: {}".format(batch_number))
                    tf.logging.info("Current loss: {}".format(_loss))
                    tf.logging.info("Current accuracy: {}".format(_acc))
                    tf.logging.info("Current perplexity: {}".format(_perpl))

            # NOTE(review): the original indentation was lost; saving once
            # at the end of every epoch is assumed here.
            self.save(epoch_id)