I'm trying to implement a kind of pointing network. Since the outputs of the pointing layer don't have a fixed dimension, I had to implement some kind of masking over the outputs, and I came up with this:
import tensorflow as tf


class Attention(object):
    """Attention mechanism implementation."""

    def __init__(self, attention_states, attention_size, mask=None):
        """Initializes a new instance of the Attention class."""
        self._states = attention_states  # [batch, length, size]
        self._attention_size = attention_size
        self._batch = tf.shape(self._states)[0]
        self._length = tf.shape(self._states)[1]
        self._size = self._states.get_shape()[2].value  # statically defined
        self._mask = mask
        self._features = None

    def _init_features(self):
        # Project the states with a 1x1 convolution, i.e. a position-wise
        # linear map into the attention space.
        states = tf.reshape(
            self._states, [self._batch, self._length, 1, self._size])
        weights = tf.get_variable(
            "kernel", [1, 1, self._size, self._attention_size])
        self._features = tf.nn.conv2d(states, weights, [1, 1, 1, 1], "SAME")

    def get_weights(self, query, scope=None):
        """Returns the attention weights for the given query."""
        with tf.variable_scope(scope or "Attention"):
            if self._features is None:
                self._init_features()
            else:
                tf.get_variable_scope().reuse_variables()
            vect = tf.get_variable("Vector", [self._attention_size])
            with tf.variable_scope("Query"):
                # `linear` is the fully connected projection helper used
                # elsewhere in my code (not shown here).
                query_features = linear(query, self._attention_size, False)
                query_features = tf.reshape(
                    query_features, [-1, 1, 1, self._attention_size])
            activations = vect * tf.tanh(self._features + query_features)
            activations = tf.reduce_sum(activations, [2, 3])
            exp_acts = tf.exp(activations)
            # NOTE: what follows is a custom implementation of
            # something equivalent to a sparse softmax.
            if self._mask is not None:
                exp_acts = exp_acts * self._mask
            sum_exp_acts = tf.expand_dims(tf.reduce_sum(exp_acts, axis=-1), 1)
            weights = tf.div(exp_acts, sum_exp_acts)
        return weights
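For reference, this is roughly how I build the mask and wire the class up (a simplified sketch: the placeholder shapes and sizes here are illustrative, not my exact pipeline; the mask is 1.0 on valid positions and 0.0 on padding):

    import tensorflow as tf

    state_size = 64        # illustrative sizes only
    attention_size = 128

    states = tf.placeholder(tf.float32, [None, None, state_size])
    lengths = tf.placeholder(tf.int32, [None])
    query = tf.placeholder(tf.float32, [None, state_size])

    # 1.0 for valid positions, 0.0 for padding.
    mask = tf.cast(tf.sequence_mask(lengths, maxlen=tf.shape(states)[1]),
                   tf.float32)

    attention = Attention(states, attention_size, mask=mask)
    weights = attention.get_weights(query)  # [batch, length], rows sum to 1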
After some training, I've started finding NaN values in the weights tensor.
I can't figure out why this happens, but searching around I came across this discussion about something similar. Could it be due to some saturation in the softmax, which I had to reimplement by hand so that the zeroed-out positions are not taken into account?
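To make my suspicion concrete, here is a minimal NumPy sketch of the two ways I think that division could blow up: the exponentials overflowing to inf for large activations, and the denominator collapsing to zero when a whole row gets masked out (both are guesses on my part, not a verified diagnosis):

    import numpy as np

    # Case 1: large activations overflow exp(), giving inf / inf = nan.
    acts = np.array([1000.0, 1000.0])
    exp_acts = np.exp(acts)                 # [inf, inf]
    print(exp_acts / exp_acts.sum())        # [nan, nan]

    # Case 2: a row whose mask is all zeros zeroes the denominator.
    exp_acts = np.exp(np.array([1.0, 2.0])) * np.array([0.0, 0.0])
    print(exp_acts / exp_acts.sum())        # 0 / 0 -> [nan, nan]

    # The usual stabilization subtracts the row max before exponentiating.
    acts = np.array([1000.0, 1000.0])
    exp_acts = np.exp(acts - acts.max())    # [1.0, 1.0]
    print(exp_acts / exp_acts.sum())        # [0.5, 0.5]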
Thanks in advance, Giulio