LSTMLayer produces NaN values even before any training

Asked: 2015-09-11 16:01:43

Tags: theano lasagne

I am currently trying to build an LSTM network with Lasagne to predict the next step of a noisy sequence. I first trained a stack of 2 LSTM layers for a while, but had to use an extremely low learning rate (1e-6) because of divergence issues (the training eventually produced NaN values). The results were somewhat disappointing: the network produced a smoothed, out-of-phase version of the input.

I then concluded that I should use a better parameter initialization than the default one. The goal was to start from a network that mimics the identity, since for a strongly autocorrelated signal the current value should be a good first estimate of the next step (x(t) ~ x(t+1)), with a bit of noise sprinkled on top.
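To make the reasoning explicit (this is just the standard LSTM update, with the peephole terms ignored): the cell state evolves as c(t) = f(t)·c(t-1) + i(t)·g(t), with g(t) = σ_c(W_xc·x(t) + W_hc·h(t-1) + b_c), and the layer output is h(t) = o(t)·σ_h(c(t)). So if the input and output gates saturate near 1, the forget gate sits near 0, W_xc is the identity and every other weight is ≈ 0, then c(t) ≈ x(t) and h(t) ≈ leaky_rectify(x(t)) ≈ x(t). That is the regime the initialization code below tries to set up.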

import theano, numpy, lasagne
from theano import tensor as T
from lasagne.layers.recurrent import LSTMLayer, InputLayer, Gate
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import sigmoid, tanh, leaky_rectify
from lasagne.layers import get_output
from lasagne.init import GlorotNormal, Normal, Constant

floatX = 'float32'

# function to create an LSTM layer that ~ propagates the input from start to finish right off the bat
# should be a good starting point for a predictive LSTM on a signal with high one-step autocorrelation
def create_identity_lstm(input, shape, orig_inp=None, noiselvl=0.01, G=10., mask_input=None):
    inp, out = shape
    # orig_inp limits the number of units actually used to pass the input
    # information from one layer to the next - the rest of the units should
    # produce ~0 activation.
    if orig_inp is None:
        orig_inp = inp
    # input gate
    inputgate = Gate(
                 W_in=GlorotNormal(noiselvl),
                 W_hid=GlorotNormal(noiselvl),
                 W_cell=Normal(noiselvl),
                 b=Constant(0.),
                 nonlinearity=sigmoid
                 )
    # forget gate
    forgetgate = Gate(
                 W_in=GlorotNormal(noiselvl),
                 W_hid=GlorotNormal(noiselvl),
                 W_cell=Normal(noiselvl),
                 b=Constant(0.),
                 nonlinearity=sigmoid
                 )
    # cell gate
    cell = Gate(
                 W_in=GlorotNormal(noiselvl),
                 W_hid=GlorotNormal(noiselvl),
                 W_cell=None,
                 b=Constant(0.),
                 nonlinearity=leaky_rectify
                 )
    # output gate
    outputgate = Gate(
                 W_in=GlorotNormal(noiselvl),
                 W_hid=GlorotNormal(noiselvl),
                 W_cell=Normal(noiselvl),
                 b=Constant(0.),
                 nonlinearity=sigmoid
                 )
    lstm = LSTMLayer(input, out, ingate=inputgate, forgetgate=forgetgate,
                     cell=cell, outgate=outputgate, nonlinearity=leaky_rectify,
                     mask_input=mask_input)
    # change matrices and biases
    # ingate - should return ~1 (matrices = 0, big bias)
    b_i = lstm.b_ingate.get_value()
    b_i[:orig_inp] += G
    lstm.b_ingate.set_value(b_i)
    # forgetgate - should return 0 (matrices = 0, big negative bias)
    b_f = lstm.b_forgetgate.get_value()
    b_f[:orig_inp] -= G
    b_f[orig_inp:] += G # to help learning future features, I preserve a large bias on "unused" units to help it remember stuff 
    lstm.b_forgetgate.set_value(b_f)
    # cell - should return x(t) (W_xc = identity, rest is 0)
    W_xc = lstm.W_in_to_cell.get_value()
    for i in xrange(orig_inp):
        W_xc[i, i] += 1.
    lstm.W_in_to_cell.set_value(W_xc)
    # outgate - should return 1 (same as ingate)
    b_o = lstm.b_outgate.get_value()
    b_o[:orig_inp] += G
    lstm.b_outgate.set_value(b_o)
    # done
    return lstm

I then use this LSTM-generating code to build the following network:

# layers
#input + dropout
input = InputLayer((None, None, 7), name='input')
mask = InputLayer((None, None), name='mask')
drop1 = DropoutLayer(input, p=0.33)
#lstm1 + dropout
lstm1 = create_identity_lstm(drop1, (7, 1024), mask_input=mask)
drop2 = DropoutLayer(lstm1, p=0.33)
#lstm2 + dropout
lstm2 = create_identity_lstm(drop2, (1024, 128), orig_inp=7, mask_input=mask)
drop3 = DropoutLayer(lstm2, p=0.33)    
#lstm3
lstm3 = create_identity_lstm(drop3, (128, 7), orig_inp=7, mask_input=mask)

# symbolic variables and prediction
x = input.input_var
ma = mask.input_var
ma_reshape = ma.dimshuffle((0,1,'x'))
yhat = get_output(lstm3, deterministic=False)
yhat_det = get_output(lstm3, deterministic=True)
y = T.ftensor3('y')
predict = theano.function([x, ma], yhat_det)

The problem is that even without any training at all, this network produces garbage values from the first LSTM layer onward, and sometimes even a bunch of NaNs:

X = numpy.random.random((5, 10000, 7)).astype('float32')
Masks = numpy.ones(X.shape[:2], dtype='float32')
hid1 = get_output(lstm1, deterministic=True)
get_hid1 = theano.function([x, ma], hid1)
h1 = get_hid1(X, Masks)
print numpy.isnan(h1).sum(axis=1).sum(axis=1)
# array([6379520, 6367232, 6377472, 6376448, 6378496])
# even the first output value is garbage!
print h1[:,0,0] - X[:,0,0]
# array([-0.03898358, -0.10118812,  0.34877831, -0.02509735,  0.36689138], dtype=float32)
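
A quick diagnostic (a sketch on top of the `h1` array above, assuming its shape is (batch, time, units)) is to locate the first time step at which a NaN appears, to see whether the values blow up immediately or degrade along the sequence:

nan_per_step = numpy.isnan(h1).any(axis=2).any(axis=0)  # bool over time steps
if nan_per_step.any():
    print 'first NaN at time step', nan_per_step.argmax()
else:
    print 'no NaNs found'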

I don't understand why: I checked every matrix, and their values are fine, exactly the way I want them. I even tried re-creating each gate activation and the resulting hidden activations using plain numpy arrays, and those reproduce the input just fine. So what am I doing wrong?
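
For reference, here is a minimal sketch of the manual numpy check described above, assuming `lstm` is a single layer returned by `create_identity_lstm` (the peephole `W_cell` terms are left out for brevity, and `sigmoid_np`, `leaky_np` and `lstm_step_np` are hypothetical helpers, not Lasagne functions):

def sigmoid_np(z):
    return 1. / (1. + numpy.exp(-z))

def leaky_np(z, alpha=0.01):
    # numpy version of lasagne's leaky_rectify (default leakiness 0.01)
    return numpy.where(z > 0, z, alpha * z)

def lstm_step_np(lstm, x_t, h_prev, c_prev):
    # one LSTM step from the layer's current weights (peepholes omitted)
    i = sigmoid_np(x_t.dot(lstm.W_in_to_ingate.get_value())
                   + h_prev.dot(lstm.W_hid_to_ingate.get_value())
                   + lstm.b_ingate.get_value())
    f = sigmoid_np(x_t.dot(lstm.W_in_to_forgetgate.get_value())
                   + h_prev.dot(lstm.W_hid_to_forgetgate.get_value())
                   + lstm.b_forgetgate.get_value())
    g = leaky_np(x_t.dot(lstm.W_in_to_cell.get_value())
                 + h_prev.dot(lstm.W_hid_to_cell.get_value())
                 + lstm.b_cell.get_value())
    o = sigmoid_np(x_t.dot(lstm.W_in_to_outgate.get_value())
                   + h_prev.dot(lstm.W_hid_to_outgate.get_value())
                   + lstm.b_outgate.get_value())
    c = f * c_prev + i * g
    h = o * leaky_np(c)  # the layer nonlinearity is leaky_rectify
    return h, c

With the initialization above this gives i ≈ 1, f ≈ 0, o ≈ 1 and g ≈ x(t), so h ≈ leaky_rectify(x(t)) as expected, which is exactly why the Theano output is so puzzling.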

0 Answers:

No answers yet.