我正在使用TensorFlow编写GRU模型。然而,有时在训练一段时间后,计算出的损失会变成 NaN(非数值)。问题是它并不总是发生,所以我很难调试。我的代码中没有 0/0 或 log(0) 之类的操作,所以我猜是梯度计算的问题?有人能给我一些建议吗?
我的GRU代码:
class GRU(object):
    """Single-layer GRU, manually unrolled over `period` time steps.

    Builds its own tf.Graph (TensorFlow 1.x style). Each time step consumes a
    [input_dim, 1] placeholder; a final [input_dim, 1] placeholder supplies the
    label, of which only element [-1][0] is compared against the last step's
    scalar output.

    Public attributes set here: period, graph, V, c, train_data, logits, loss,
    learning_rate, optimizer, train_prediction.
    """

    def __init__(self, input_dim, hidden_dim, period):
        self.period = period
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Parameters: uniform init in [-1/sqrt(hidden_dim), +1/sqrt(hidden_dim)].
            bound = np.sqrt(1. / hidden_dim)
            # U*: input-to-hidden, W*: hidden-to-hidden, b*: biases,
            # for the update gate (z), reset gate (r) and candidate state (_).
            Uz = tf.Variable(tf.random_uniform([hidden_dim, input_dim], -bound, bound))
            Ur = tf.Variable(tf.random_uniform([hidden_dim, input_dim], -bound, bound))
            U_ = tf.Variable(tf.random_uniform([hidden_dim, input_dim], -bound, bound))
            Wz = tf.Variable(tf.random_uniform([hidden_dim, hidden_dim], -bound, bound))
            Wr = tf.Variable(tf.random_uniform([hidden_dim, hidden_dim], -bound, bound))
            W_ = tf.Variable(tf.random_uniform([hidden_dim, hidden_dim], -bound, bound))
            bz = tf.Variable(tf.zeros([hidden_dim, 1]))
            br = tf.Variable(tf.zeros([hidden_dim, 1]))
            b_ = tf.Variable(tf.zeros([hidden_dim, 1]))
            # Readout: projects the hidden state to one scalar per step.
            self.V = tf.Variable(tf.random_uniform([1, hidden_dim], -bound, bound))
            self.c = tf.Variable(tf.zeros([1, 1]))

            # Definition of the cell computation (standard GRU equations).
            def gru_cell(x_t, h_t_prev):
                z = tf.sigmoid(tf.matmul(Uz, x_t) + tf.matmul(Wz, h_t_prev) + bz)
                r = tf.sigmoid(tf.matmul(Ur, x_t) + tf.matmul(Wr, h_t_prev) + br)
                h_ = tf.tanh(tf.matmul(U_, x_t) + tf.matmul(W_, r * h_t_prev) + b_)
                h = tf.multiply((1 - z), h_) + tf.multiply(z, h_t_prev)
                # [0][0] extracts the scalar from the [1, 1] readout.
                output = tf.tanh(tf.matmul(self.V, h) + self.c)[0][0]
                return output, h

            # Input data: `period` input steps plus one label placeholder.
            self.train_data = list()
            for _ in range(period + 1):
                self.train_data.append(
                    tf.placeholder(tf.float32, shape=[input_dim, 1]))
            train_inputs = self.train_data[:period]
            train_labels = self.train_data[period]

            # Unrolled GRU loop; the initial state is a non-trainable zero
            # variable (NOTE(review): it is never reset between runs unless
            # reinitialized — confirm that is intended).
            outputs = list()
            state = tf.Variable(tf.zeros([hidden_dim, 1]), trainable=False)
            for i in train_inputs:
                output, state = gru_cell(i, state)
                outputs.append(output)
            self.logits = outputs

            # FIX for the intermittent NaN: the original loss was
            #     tf.sqrt(tf.squared_difference(pred, label))
            # d/dx sqrt(x) = 1 / (2*sqrt(x)) -> inf as the loss approaches 0,
            # so whenever a prediction nearly matched its label, the backward
            # pass produced inf/NaN gradients and corrupted the weights
            # (clip_by_global_norm cannot repair a NaN — its norm is NaN too).
            # tf.abs(a - b) == sqrt((a - b)^2) in forward value, but its
            # subgradient at 0 is defined (0), so training stays finite.
            self.loss = tf.abs(self.logits[-1] - train_labels[-1][0])

            # Optimizer: SGD with global-norm gradient clipping at 1.25.
            self.learning_rate = tf.placeholder(tf.float32, shape=[])
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
            gradients, v = zip(*optimizer.compute_gradients(self.loss))
            gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
            self.optimizer = optimizer.apply_gradients(
                zip(gradients, v))

            # Predictions: the output of the last unrolled step.
            self.train_prediction = self.logits[-1]