RNN推理所需的解码器目标

时间:2018-11-18 06:42:15

标签: python tensorflow lstm rnn seq2seq

我一直在尝试使用deepfix工具(https://bitbucket.org/iiscseal/deepfix)进行一些实验,该工具是一个用于纠正常见编程错误的seq2seq模型。我对代码做了修改,使其与TF-1.12兼容,因为原始代码使用了tensorflow.contrib.seq2seq中TF-1.0.x版本的函数,而这些函数在TF-1.12中已不再受支持。

主要更改是在seq2seq_model中定义的neural_net/train.py中。 以下是更改后的代码。我是tensorflow RNN的新手,并使用在线代码的帮助对解码器部分进行了编码。

# Attention-based seq2seq model (encoder/decoder LSTM stack) for the DeepFix
# program-repair task.
# NOTE(review): the class body's indentation was lost in this paste -- the
# PAD/EOS constants and all methods below belong inside the class.
class seq2seq_model():

# Reserved vocabulary ids: PAD fills positions past a sequence's true end;
# EOS marks end-of-sequence and doubles as the decoder's GO symbol.
PAD = 0
EOS = 1

def __init__(self, vocab_size, embedding_size, max_output_seq_len,
             cell_type='LSTM', memory_dim=300, num_layers=4, dropout=0.2,
             attention=True,
             scope=None,
             verbose=False):
    """Build the full seq2seq graph and a saver for its variables.

    Args:
        vocab_size: number of tokens in the vocabulary.
        embedding_size: dimensionality of the token embeddings.
        max_output_seq_len: upper bound on the decoded sequence length.
        cell_type: RNN cell kind forwarded to _new_RNN_cell (e.g. 'LSTM').
        memory_dim: hidden size of each RNN layer.
        num_layers: number of stacked RNN layers.
        dropout: dropout rate in [0, 1]; 0 disables the keep_prob placeholder.
        attention: stored flag; presumably consumed by the decoder build.
        scope: optional variable-name prefix used to filter saver variables.
        verbose: if True, print the discovered saver variables.
    """
    # Chained comparison instead of the original two-clause `and`.
    assert 0 <= dropout <= 1, '0 <= dropout <= 1, you passed dropout={}'.format(
        dropout)

    tf.set_random_seed(1189)

    self.attention = attention
    self.max_output_seq_len = max_output_seq_len

    self.memory_dim = memory_dim
    self.num_layers = num_layers
    self.dropout = dropout
    self.scope = scope

    # keep_prob is only created (and later fed) when dropout is active;
    # see make_feed_dict / sample.
    self.keep_prob = tf.placeholder(tf.float32) if dropout != 0 else None

    self.vocab_size = vocab_size
    self.embedding_size = embedding_size

    self.encoder_cell = _new_RNN_cell(
        memory_dim, num_layers, cell_type, dropout, self.keep_prob)
    self.decoder_cell = _new_RNN_cell(
        memory_dim, num_layers, cell_type, dropout, self.keep_prob)

    self._make_graph()

    # Restrict the saver to this model's scope so several models can
    # coexist in a single graph without clobbering each other.
    if self.scope is not None:
        saver_vars = [var for var in tf.global_variables()
                      if var.name.startswith(self.scope)]
    else:
        saver_vars = tf.global_variables()

    if verbose:
        # print() calls work under both Python 2 and 3; the original used
        # Python 2 print statements, which are a SyntaxError on Python 3.
        print('root-scope: {}'.format(self.scope))
        print("\n\nDiscovered %d saver variables." % len(saver_vars))
        for each in saver_vars:
            print(each.name)

    self.saver = tf.train.Saver(saver_vars, max_to_keep=5)

@property
def decoder_hidden_units(self):
    """Hidden size of the decoder; identical to the shared memory_dim."""
    return self.memory_dim

def _make_graph(self):
    """Assemble the whole graph; the build steps must run in this order
    (each step consumes tensors produced by the previous ones)."""
    build_steps = (
        self._init_placeholders,
        self._init_decoder_train_connectors,
        self._init_embeddings,
        self._init_simple_encoder,
        self._init_decoder,
        self._init_optimizer,
    )
    for step in build_steps:
        step()

def _init_placeholders(self):
    """ Everything is time-major """
    self.encoder_inputs = tf.placeholder(
        shape=(None, None),
        dtype=tf.int32,
        name='encoder_inputs',
    )
    self.encoder_inputs_length = tf.placeholder(
        shape=(None,),
        dtype=tf.int32,
        name='encoder_inputs_length',
    )

    self.decoder_targets = tf.placeholder(
        shape=(None, None),
        dtype=tf.int32,
        name='decoder_targets'
    )
    self.decoder_targets_length = tf.placeholder(
        shape=(None,),
        dtype=tf.int32,
        name='decoder_targets_length',
    )

def _init_decoder_train_connectors(self):
    """Derive train-time decoder inputs/targets from raw decoder_targets.

    Layout is time-major: decoder_targets is (time, batch). Produces:
      - decoder_train_inputs: EOS (used as the GO symbol) prepended, for
        teacher forcing;
      - decoder_train_targets: targets with EOS inserted at each
        sequence's true end;
      - loss_weights: all-ones weights for sequence_loss.
    """

    with tf.name_scope('decoderTrainFeeds'):
        sequence_size, batch_size = tf.unstack(
            tf.shape(self.decoder_targets), name='decoder_targets_shape')

        # (1, batch) rows holding a single EOS / PAD token per sequence.
        EOS_SLICE = tf.ones([1, batch_size], dtype=tf.int32) * self.EOS
        PAD_SLICE = tf.ones([1, batch_size], dtype=tf.int32) * self.PAD

        # Shift right by prepending EOS; lengths grow by one accordingly.
        self.decoder_train_inputs = tf.concat(
            [EOS_SLICE, self.decoder_targets], axis=0, name="decoder_train_inputs")
        self.decoder_train_length = self.decoder_targets_length + 1

        # Extend targets by one PAD step, then mark position
        # (decoder_train_length - 1) of each sequence with EOS via a
        # one-hot mask.
        decoder_train_targets = tf.concat(
            [self.decoder_targets, PAD_SLICE], axis=0)
        decoder_train_targets_seq_len, _ = tf.unstack(
            tf.shape(decoder_train_targets))
        decoder_train_targets_eos_mask = tf.one_hot(self.decoder_train_length - 1,
                                                    decoder_train_targets_seq_len,
                                                    on_value=self.EOS, off_value=self.PAD,
                                                    dtype=tf.int32)
        # one_hot yields (batch, time); transpose back to time-major.
        decoder_train_targets_eos_mask = tf.transpose(
            decoder_train_targets_eos_mask, [1, 0])

        # Addition works because the slot receiving EOS holds PAD (== 0);
        # NOTE(review): assumes the true targets carry PAD there -- confirm.
        decoder_train_targets = tf.add(decoder_train_targets,
                                       decoder_train_targets_eos_mask, name="decoder_train_targets")

        self.decoder_train_targets = decoder_train_targets

        # Uniform (batch, max_time) weights for seq2seq.sequence_loss.
        self.loss_weights = tf.ones([
            batch_size,
            tf.reduce_max(self.decoder_train_length)
        ], dtype=tf.float32, name="loss_weights")

def _init_embeddings(self):
    with tf.variable_scope("embedding") as scope:
        sqrt3 = math.sqrt(3)
        initializer = tf.random_uniform_initializer(-sqrt3, sqrt3)

        self.embedding_matrix = tf.get_variable(
            name="embedding_matrix",
            shape=[self.vocab_size, self.embedding_size],
            initializer=initializer,
            dtype=tf.float32)

        self.encoder_inputs_embedded = tf.nn.embedding_lookup(
            self.embedding_matrix, self.encoder_inputs,
            name="encoder_inputs_embedded")

        self.decoder_train_inputs_embedded = tf.nn.embedding_lookup(
            self.embedding_matrix, self.decoder_train_inputs,
            name="decoder_train_inputs_embedded")

def _init_simple_encoder(self):
    with tf.variable_scope("Encoder") as scope:
        (self.encoder_outputs, self.encoder_state) = (
            tf.nn.dynamic_rnn(cell=self.encoder_cell,
                              inputs=self.encoder_inputs_embedded,
                              sequence_length=self.encoder_inputs_length,
                              time_major=True,
                              dtype=tf.float32)
        )

def _init_decoder(self):
    with tf.variable_scope("decoder") as scope:
        # def output_fn(outputs):
        #     return tf.contrib.layers.fully_connected(outputs, self.vocab_size, scope=scope,
        #                                                 name = "output_fn")

        sequence_size, batch_size = tf.unstack(
            tf.shape(self.decoder_targets), name='decoder_targets_shape')

        train_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_train_inputs_embedded,
                sequence_length=self.decoder_train_length,
                time_major=True,
                name="train_helper")


        pred_helper = seq2seq.SampleEmbeddingHelper(
                embedding=self.embedding_matrix,
                start_tokens=tf.ones([batch_size], dtype=tf.int32) * self.EOS,
                end_token=self.EOS)
                # name="pred_helper")

        def _decode(helper, scope, reuse=None):
            with tf.variable_scope(scope, reuse=reuse):
                attention_states = tf.transpose(
                    self.encoder_outputs, [1, 0, 2])

                attention_mechanism = seq2seq.BahdanauAttention(
                num_units=self.decoder_hidden_units, memory=attention_states,
                name="attention_mechanism")

                attention_cell = seq2seq.AttentionWrapper(
                self.decoder_cell, attention_mechanism,
                name="atttention_wrapper")

                out_cell = tf.contrib.rnn.OutputProjectionWrapper(
                    attention_cell, self.vocab_size, reuse=reuse)
                    # name="output_cell")

                decoder = seq2seq.BasicDecoder(
                    cell=out_cell, helper=helper,
                    initial_state=out_cell.zero_state(
                        dtype=tf.float32, batch_size=batch_size))
                        # name="decoder")

                outputs = seq2seq.dynamic_decode(
                    decoder=decoder, output_time_major=True,
                    impute_finished=True)
                    # name="outputs")

                return outputs



        (self.decoder_logits_train, self.decoder_state_train, _) = _decode(train_helper, "decoder")
        (self.decoder_logits_inference, self.decoder_state_inference, _) = _decode(pred_helper, "decoder", reuse=True)

        self.decoder_logits_train = self.decoder_logits_train.rnn_output
        self.decoder_logits_inference = self.decoder_logits_inference.rnn_output
        # self.decoder_logits_train = output_fn(self.decoder_outputs_train)

        self.decoder_prediction_train = tf.argmax(
            self.decoder_logits_train, axis=-1, name='decoder_prediction_train')

        scope.reuse_variables()

        self.decoder_prediction_inference = tf.argmax(self.decoder_logits_inference, axis=-1,
                                                      name='decoder_prediction_inference')


def _init_optimizer(self):
    logits = tf.transpose(self.decoder_logits_train, [1, 0, 2])
    targets = tf.transpose(self.decoder_train_targets, [1, 0])
    self.loss = seq2seq.sequence_loss(logits=logits, targets=targets,
                                      weights=self.loss_weights)

    self.optimizer = tf.train.AdamOptimizer()
    gvs = self.optimizer.compute_gradients(self.loss)

    def ClipIfNotNone(grad):
        if grad is None:
            return grad
        return tf.clip_by_value(grad, -1., 1)

    # capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
    capped_gvs = [(ClipIfNotNone(grad), var) for grad, var in gvs]

    self.train_op = self.optimizer.apply_gradients(capped_gvs)

def make_feed_dict(self, x, x_len, y, y_len):
    feed_dict = {
        self.encoder_inputs: x,
        self.encoder_inputs_length: x_len,

        self.decoder_targets: y,
        self.decoder_targets_length: y_len,
    }

    if self.dropout != 0:
        feed_dict.update({self.keep_prob: 1.0 - self.dropout})

    return feed_dict

def load_parameters(self, sess, filename):
    self.saver.restore(sess, filename)

def save_parameters(self, sess, filename, global_step=None):
    self.saver.save(sess, filename, global_step=global_step)

def train_step(self, session, x, x_len, y, y_len):
    feed_dict = self.make_feed_dict(x, x_len, y, y_len)
    _, loss = session.run([self.train_op, self.loss], feed_dict)
    return loss

def validate_step(self, session, x, x_len, y, y_len):
    feed_dict = self.make_feed_dict(x, x_len, y, y_len)
    loss, decoder_prediction, decoder_train_targets = session.run([self.loss,
                                                                   self.decoder_prediction_inference,
                                                                   self.decoder_train_targets], feed_dict)
    return loss, np.array(decoder_prediction).T, np.array(decoder_train_targets).T

def sample(self, session, X, X_len):
    feed_dict = {self.encoder_inputs: X,
                 self.encoder_inputs_length: X_len}

    if self.dropout != 0:
        feed_dict.update({self.keep_prob: 1.0})

    decoder_prediction = session.run(
        self.decoder_prediction_inference, feed_dict)
    return np.array(decoder_prediction).T

此代码存在一些问题:

  1. 主要问题——seq2seq.train_step()和seq2seq.validate_step()函数可以正常运行,但当我使用seq2seq.sample()进行实际推理时,出现了一个错误,要求我为decoder_targets占位符提供值。这是意外情况,因为sample()函数仅用于推理,不应需要decoder_targets。错误:
  

InvalidArgumentError(回溯见上文):必须为dtype为int32、形状为[?,?]的占位符张量'ids/decoder_targets'提供一个值 [[节点 ids/decoder_targets(定义于 .../code/neural_net/train.py:241)= Placeholder[dtype=DT_INT32, shape=[?,?], _device="/job:localhost/replica:0/task:0/device:CPU:0"]]]

  2. 当我尝试使用GreedyEmbeddingHelper代替SampleEmbeddingHelper并运行decoder_logits_inference op时,机器挂起并在一段时间后耗尽了内存。尽管decoder_logits_train可以正常工作。

1 个答案:

答案 0 :(得分:0)

  1. 好吧,SampleEmbeddingHelper确实需要解码器目标,因为它混合了GreedyEmbeddingHelper(推断模式)和tf.contrib.seq2seq.TrainingHelper(教师强制)的一部分。我认为您只需要使用GreedyEmbeddingHelper

  2. 从一开始,参数是完全随机的(如果未预先训练)。 也许您已经看到seq2seq模型的前几个循环的结果完全被弄乱了。 因此,如果您使用GreedyEmbeddingHelper,它根据前一个结果输出结果,并且当然没有人教它“在哪里停止”,那么它通常会无限循环直到您的内存用完。为了解决这个问题,您需要为tf.contrib.seq2seq.dynamic_decode中的句子长度设置一个上限。 参数为maximum_iterations。如图所示 tf.contrib.seq2seq.dynamic_decode