我正在尝试通过遵循Standford的CS231n类(http://cs231n.github.io/assignments2018/assignment3/)的assignment3中的相同逻辑来构建图像字幕RNN。我之前已经完成了使用numpy构建RNN的assignment3。现在,我想构建相同的RNN,但是这次使用Tensorflow,以下是我的代码。但是,由于某种原因,我在训练中损失的钱并没有减少。因此,我认为我的代码中肯定存在一些逻辑和设置问题。
在培训期间,此RNN有两个输入,功能和字幕。功能是CNN的输出,标题是在每个时间点描述单词的矩阵。通过矩阵乘法,特征预计将成为初始输出h0。具有零矩阵的h0将形成lstm的初始状态。字幕(captions_in)将转换为单词嵌入矩阵,其中包含RNN所有时间点的输入。 lstm的输出将投影到一个分数矩阵,该分数矩阵用于预测图像的字幕。分数矩阵和captions_out矩阵用于评估损失。
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
tfe.enable_eager_execution()
class CaptioningRNN(object):
def __init__(self, word_to_idx, input_dim=512, wordvec_dim=128, hidden_dim=128):
self.word_to_idx = word_to_idx
self.idx_to_word = {i: w for w, i in word_to_idx.items()}
self.hidden_dim = hidden_dim
# initialize weights
self.W_proj = tfe.Variable(np.random.randn(input_dim, hidden_dim)/np.sqrt(input_dim), dtype=tf.float32)
self.b_proj = tfe.Variable(np.zeros(hidden_dim), dtype=tf.float32)
self.W_embed = tfe.Variable(np.random.randn(len(word_to_idx), wordvec_dim), dtype=tf.float32)
self.W_vocab = tfe.Variable(np.random.randn(hidden_dim, len(word_to_idx))/np.square(hidden_dim), dtype=tf.float32)
self.b_vocab = tfe.Variable(np.zeros(len(word_to_idx)), dtype=tf.float32)
# initialize lstm cell
self.encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_dim)
def forward(self, features, captions):
outputs = []
h0 = tf.matmul(features, self.W_proj) + self.b_proj
x = tf.nn.embedding_lookup(self.W_embed, captions)
timestep_x = tf.unstack(x, axis=1)
c = tfe.Variable(np.zeros((features.shape[0], self.hidden_dim)), dtype=tf.float32)
state = tf.nn.rnn_cell.LSTMStateTuple(c=c, h=h0)
for input_steps in timestep_x:
output, state = self.encoder_cell(input_steps, state)
outputs.append(output)
outputs = tf.stack(outputs, axis=1)
reshaped_outputs = tf.reshape(outputs, [outputs.shape[0] * outputs.shape[1], outputs.shape[2]])
reshaped_scores = tf.matmul(reshaped_outputs, self.W_vocab)
scores = tf.reshape(reshaped_scores, [outputs.shape[0], outputs.shape[1], self.b_vocab.shape[0]]) + self.b_vocab
return scores
def cost(self, scores, labels):
scores_flat = tf.reshape(scores, [scores.shape[0]*scores.shape[1], scores.shape[2]])
labels_flat = tf.reshape(labels, [labels.shape[0] * labels.shape[1], 1])
cost = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels_flat, logits=scores_flat)
return tf.reduce_mean(cost)
def get_loss(model, features, captions):
scores = model.forward(features, captions[:, :-1])
cost = model.cost(scores, captions[:, 1:])
return cost
N = 2 # batch_size
T = 3 # timesteps
D = 4 # input_dim
W = 5 # wordvec_dim
H = 10 # hidden_dim
word_to_idx = {'<NULL>': 0, 'cat': 2, 'dog': 3}
vocab_size = len(word_to_idx)
features = np.random.randn(N, D).astype(np.float32)
captions = np.random.randint(vocab_size, size=(N, T))
model = CaptioningRNN(word_to_idx, input_dim=D, wordvec_dim=W, hidden_dim=H)
# get_loss(model, features, captions)
optimizer = tf.train.AdamOptimizer(5e-3)
for i in range(5000):
optimizer.minimize(lambda: get_loss(model, features, captions))
if i % 500 == 0:
ls = get_loss(model, features, captions)
print("loss: {}".format(ls))
损失:2.4719245433807373
损失:2.4723522663116455
损失:2.472407579421997
损失:2.4723868370056152
损失:2.472378969192505
损失:2.4724044799804688
损失:2.472418785095215
损失:2.472426414489746
损失:2.472430944442749
损失:2.4724338054656982