So I'm new to machine learning, and I'm taking a bonus course at university where I have to train an LSTM model to generate captions. So far I have read this: Blogpost_about_lstms
and I'm using this as a reference: some_random_code
What I want to implement: I have a dataset with the following structure: the output of a CNN, a vector of size 2048 that holds certain "features" of an image, plus 5 captions describing that image.
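To make that concrete, one record of my dataset looks roughly like this (the field names are just my own illustration, not the actual format):

import numpy as np

# one record of the dataset (names are only illustrative)
example = {
    "cnn_features": np.zeros(2048, dtype=np.float32),  # feature vector from the CNN
    "captions": [                                      # 5 human-written captions
        "a dog runs across the grass",
        "a brown dog playing outside",
        # ... 3 more captions for the same image
    ],
}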
Training: input: CNN vector + caption; output: caption (guess).
Validation: input: CNN vector; output: caption (guess).
So how can I train with 2 inputs (the CNN data and the caption sequences) but then generate new captions from the CNN input vector alone?
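From what I've read so far, the usual trick seems to be to project the CNN vector into the LSTM's state space and use that as the initial state, so the captions are only ever inputs/targets of the decoder itself. Just to show what I mean, a rough sketch (untested, all the names here are my own):

import tensorflow as tf

hidden_size = 512  # example value

# image_features: [batch_size, 2048] CNN output (placeholder name is mine)
image_features = tf.placeholder(tf.float32, [None, 2048])

# learned projection from the image feature space into the LSTM state size
state_init = tf.layers.dense(image_features, hidden_size, activation=tf.nn.tanh)
initial_state = tf.contrib.rnn.LSTMStateTuple(c=state_init, h=state_init)
# ... then pass initial_state to tf.nn.dynamic_rnn instead of a zero state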
This part is quite tricky for me and I can't really get my head around the theory. And TensorFlow itself is another thing I'm still wrestling with.
I have a normal Seq_2_Seq model that works fine, but now I'm stuck :/
import tensorflow as tf

import config  # the project's own settings module


class Model(object):
    def __init__(self, _input, is_training, hidden_size, vocab_size, num_layers,
                 dropout=config.trainer.dropout, init_scale=config.trainer.init_scale):
        self.is_training = is_training
        self.input_obj = _input
        self.batch_size = _input.batch_size
        self.num_steps = _input.num_steps
        self.hidden_size = hidden_size

        # create the word embeddings
        with tf.device("/cpu:0"):
            randomized = tf.random_uniform([vocab_size, hidden_size], -init_scale, init_scale)
            embedding = tf.Variable(randomized)
            inputs = tf.nn.embedding_lookup(embedding, self.input_obj.input_data)
        if is_training and dropout < 1:
            inputs = tf.nn.dropout(inputs, dropout)

        # set up the state storage / extraction
        self.init_state = tf.placeholder(tf.float32, [num_layers, 2, self.batch_size, hidden_size])
        state_per_layer_list = tf.unstack(self.init_state, axis=0)
        rnn_tuple_state = tuple(
            tf.contrib.rnn.LSTMStateTuple(state_per_layer_list[idx][0], state_per_layer_list[idx][1])
            for idx in range(num_layers))

        # create an LSTM cell to be unrolled, with a dropout wrapper when training
        def make_cell():
            cell = tf.contrib.rnn.LSTMCell(hidden_size, forget_bias=config.trainer.forget_bias)
            if is_training and dropout < 1:
                cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)
            return cell

        cell = make_cell()
        if num_layers > 1:
            # each layer needs its own cell instance; reusing one object
            # makes MultiRNNCell share weights or raise an error
            cell = tf.contrib.rnn.MultiRNNCell([make_cell() for _ in range(num_layers)],
                                               state_is_tuple=True)

        output, self.state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32,
                                               initial_state=rnn_tuple_state)
        # reshape to (batch_size * num_steps, hidden_size)
        output = tf.reshape(output, [-1, hidden_size])

        softmax_w = tf.Variable(tf.random_uniform([hidden_size, vocab_size], -init_scale, init_scale))
        softmax_b = tf.Variable(tf.random_uniform([vocab_size], -init_scale, init_scale))
        logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
        # reshape logits to be a 3-D tensor for sequence loss
        logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])

        # use the contrib sequence loss and average over the batches
        loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            self.input_obj.targets,
            tf.ones([self.batch_size, self.num_steps], dtype=tf.float32),
            average_across_timesteps=False,
            average_across_batch=True)
        # update the cost
        self.cost = tf.reduce_sum(loss)

        # get the prediction accuracy
        self.softmax_out = tf.nn.softmax(tf.reshape(logits, [-1, vocab_size]))
        self.predict = tf.cast(tf.argmax(self.softmax_out, axis=1), tf.int32)
        correct_prediction = tf.equal(self.predict, tf.reshape(self.input_obj.targets, [-1]))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        if not is_training:
            return

        self.learning_rate = tf.Variable(0.01, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), 5)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self.new_lr = tf.placeholder(tf.float32, shape=[])
        self.lr_update = tf.assign(self.learning_rate, self.new_lr)

    def assign_lr(self, session, lr_value):
        session.run(self.lr_update, feed_dict={self.new_lr: lr_value})
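My current guess is that, instead of feeding zeros into self.init_state, I could compute the initial state from the CNN vector with a learned projection, roughly like this (a sketch under my own assumptions, not tested):

import tensorflow as tf

def cnn_to_init_state(cnn_vectors, num_layers, hidden_size):
    """Sketch: map [batch_size, 2048] CNN vectors to the
    [num_layers, 2, batch_size, hidden_size] layout that
    self.init_state expects (it would replace the placeholder)."""
    proj = tf.layers.dense(cnn_vectors, hidden_size, activation=tf.nn.tanh)
    per_layer = tf.stack([proj, proj])         # [2, batch, hidden]: c and h
    return tf.stack([per_layer] * num_layers)  # [num_layers, 2, batch, hidden]

I've also seen approaches that instead feed the image vector into the LSTM as the very first input token, and I'm not sure which of the two is more appropriate here.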
I don't need a ready-made solution, but some explanation of how to move forward would be awesome!