Question

我正在尝试通过遵循Standford的CS231n类（http://cs231n.github.io/assignments2018/assignment3/）的assignment3中的相同逻辑来构建图像字幕RNN。我之前已经完成了使用numpy构建RNN的assignment3。现在，我想构建相同的RNN，但是这次使用Tensorflow，以下是我的代码。但是，由于某种原因，我在训练中损失的钱并没有减少。因此，我认为我的代码中肯定存在一些逻辑和设置问题。

在培训期间，此RNN有两个输入，功能和字幕。功能是CNN的输出，标题是在每个时间点描述单词的矩阵。通过矩阵乘法，特征预计将成为初始输出h0。具有零矩阵的h0将形成lstm的初始状态。字幕（captions_in）将转换为单词嵌入矩阵，其中包含RNN所有时间点的输入。 lstm的输出将投影到一个分数矩阵，该分数矩阵用于预测图像的字幕。分数矩阵和captions_out矩阵用于评估损失。

import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np

tfe.enable_eager_execution()

class CaptioningRNN(object):
    def __init__(self, word_to_idx, input_dim=512, wordvec_dim=128, hidden_dim=128):
        self.word_to_idx = word_to_idx
        self.idx_to_word = {i: w for w, i in word_to_idx.items()}
        self.hidden_dim = hidden_dim

        # initialize weights
        self.W_proj = tfe.Variable(np.random.randn(input_dim, hidden_dim)/np.sqrt(input_dim), dtype=tf.float32)
        self.b_proj = tfe.Variable(np.zeros(hidden_dim), dtype=tf.float32)
        self.W_embed = tfe.Variable(np.random.randn(len(word_to_idx), wordvec_dim), dtype=tf.float32)
        self.W_vocab = tfe.Variable(np.random.randn(hidden_dim, len(word_to_idx))/np.square(hidden_dim), dtype=tf.float32)
        self.b_vocab = tfe.Variable(np.zeros(len(word_to_idx)), dtype=tf.float32)

        # initialize lstm cell
        self.encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_dim)


    def forward(self, features, captions):
        outputs = []

        h0 = tf.matmul(features, self.W_proj) + self.b_proj
        x = tf.nn.embedding_lookup(self.W_embed, captions)
        timestep_x = tf.unstack(x, axis=1)

        c = tfe.Variable(np.zeros((features.shape[0], self.hidden_dim)), dtype=tf.float32)
        state = tf.nn.rnn_cell.LSTMStateTuple(c=c, h=h0)

        for input_steps in timestep_x:
            output, state = self.encoder_cell(input_steps, state)

            outputs.append(output)

        outputs = tf.stack(outputs, axis=1)

        reshaped_outputs = tf.reshape(outputs, [outputs.shape[0] * outputs.shape[1], outputs.shape[2]])
        reshaped_scores = tf.matmul(reshaped_outputs, self.W_vocab)
        scores = tf.reshape(reshaped_scores, [outputs.shape[0], outputs.shape[1], self.b_vocab.shape[0]]) + self.b_vocab

        return scores


    def cost(self, scores, labels):
        scores_flat = tf.reshape(scores, [scores.shape[0]*scores.shape[1], scores.shape[2]])
        labels_flat = tf.reshape(labels, [labels.shape[0] * labels.shape[1], 1])

        cost = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels_flat, logits=scores_flat)
        return tf.reduce_mean(cost)


def get_loss(model, features, captions):
    scores = model.forward(features, captions[:, :-1])
    cost = model.cost(scores, captions[:, 1:])
    return cost


N = 2     # batch_size
T = 3     # timesteps
D = 4     # input_dim
W = 5     # wordvec_dim
H = 10     # hidden_dim
word_to_idx = {'<NULL>': 0, 'cat': 2, 'dog': 3}
vocab_size = len(word_to_idx)

features = np.random.randn(N, D).astype(np.float32)
captions = np.random.randint(vocab_size, size=(N, T))

model = CaptioningRNN(word_to_idx, input_dim=D, wordvec_dim=W, hidden_dim=H)
# get_loss(model, features, captions)

optimizer = tf.train.AdamOptimizer(5e-3)

for i in range(5000):
    optimizer.minimize(lambda: get_loss(model, features, captions))

    if i % 500 == 0:
        ls = get_loss(model, features, captions)
        print("loss: {}".format(ls))

损失：2.4719245433807373
损失：2.4723522663116455
损失：2.472407579421997
损失：2.4723868370056152
损失：2.472378969192505
损失：2.4724044799804688
损失：2.472418785095215
损失：2.472426414489746
损失：2.472430944442749
损失：2.4724338054656982

无法使用Tensorflow构建适当的图像标题rnn

0 个答案: