I'm trying to implement a kind of pointing network. Since the outputs of the pointing layer don't have a fixed dimension, I had to implement some kind of masking over the outputs, and I came up with this:
import tensorflow as tf


class Attention(object):
    """Attention mechanism implementation."""

    def __init__(self, attention_states, attention_size, mask=None):
        """Initializes a new instance of the Attention class."""
        self._states = attention_states  # [batch, length, size]
        self._attention_size = attention_size
        self._batch = tf.shape(self._states)[0]
        self._length = tf.shape(self._states)[1]
        self._size = self._states.get_shape()[2].value  # statically defined
        self._mask = mask
        self._features = None

    def _init_features(self):
        # Project the states with a 1x1 convolution, i.e. a position-wise
        # linear map into the attention space.
        states = tf.reshape(
            self._states, [self._batch, self._length, 1, self._size])
        weights = tf.get_variable(
            "kernel", [1, 1, self._size, self._attention_size])
        self._features = tf.nn.conv2d(states, weights, [1, 1, 1, 1], "SAME")

    def get_weights(self, query, scope=None):
        """Returns the attention weights for the given query."""
        with tf.variable_scope(scope or "Attention"):
            if self._features is None:
                self._init_features()
            else:
                tf.get_variable_scope().reuse_variables()
            vect = tf.get_variable("Vector", [self._attention_size])
            with tf.variable_scope("Query"):
                # `linear` is the fully connected projection helper used
                # elsewhere in my code (not shown here).
                query_features = linear(query, self._attention_size, False)
                query_features = tf.reshape(
                    query_features, [-1, 1, 1, self._attention_size])
            activations = vect * tf.tanh(self._features + query_features)
            activations = tf.reduce_sum(activations, [2, 3])
            exp_acts = tf.exp(activations)
            # NOTE: what follows is a custom implementation of
            # something equivalent to a sparse softmax.
            if self._mask is not None:
                exp_acts = exp_acts * self._mask
            sum_exp_acts = tf.expand_dims(tf.reduce_sum(exp_acts, axis=-1), 1)
            weights = tf.div(exp_acts, sum_exp_acts)
        return weights
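For reference, this is roughly how I build the mask and wire the class up (a simplified sketch: the placeholder shapes and sizes here are illustrative, not my exact pipeline; the mask is 1.0 on valid positions and 0.0 on padding):

    import tensorflow as tf

    state_size = 64        # illustrative sizes only
    attention_size = 128

    states = tf.placeholder(tf.float32, [None, None, state_size])
    lengths = tf.placeholder(tf.int32, [None])
    query = tf.placeholder(tf.float32, [None, state_size])

    # 1.0 for valid positions, 0.0 for padding.
    mask = tf.cast(tf.sequence_mask(lengths, maxlen=tf.shape(states)[1]),
                   tf.float32)

    attention = Attention(states, attention_size, mask=mask)
    weights = attention.get_weights(query)  # [batch, length], rows sum to 1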
After some training, I've started finding NaN values in the weights tensor.
I can't figure out why this happens, but searching around I came across this discussion about something similar. Could it be due to some saturation in the softmax, which I had to reimplement by hand so that the zeroed-out positions are not taken into account?
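To make my suspicion concrete, here is a minimal NumPy sketch of the two ways I think that division could blow up: the exponentials overflowing to inf for large activations, and the denominator collapsing to zero when a whole row gets masked out (both are guesses on my part, not a verified diagnosis):

    import numpy as np

    # Case 1: large activations overflow exp(), giving inf / inf = nan.
    acts = np.array([1000.0, 1000.0])
    exp_acts = np.exp(acts)                 # [inf, inf]
    print(exp_acts / exp_acts.sum())        # [nan, nan]

    # Case 2: a row whose mask is all zeros zeroes the denominator.
    exp_acts = np.exp(np.array([1.0, 2.0])) * np.array([0.0, 0.0])
    print(exp_acts / exp_acts.sum())        # 0 / 0 -> [nan, nan]

    # The usual stabilization subtracts the row max before exponentiating.
    acts = np.array([1000.0, 1000.0])
    exp_acts = np.exp(acts - acts.max())    # [1.0, 1.0]
    print(exp_acts / exp_acts.sum())        # [0.5, 0.5]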
Thanks in advance, Giulio