Gradient descent fails, all parameters become NaN

Date: 2016-01-18 11:34:39

Tags: python numpy gradient theano gradient-descent

I am using Theano to minimize a function with stochastic gradient descent:

import numpy

import theano
import theano.tensor as T


class LogisticRegression(object):

    def __init__(self, input, n_in, n_out):

        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                             dtype=theano.config.floatX),
                                name='W', borrow=True)

        self.b = theano.shared(value=numpy.zeros((n_out,),
                                             dtype=theano.config.floatX),
                                name='b', borrow=True)


        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        # mean negative log-probability of the correct class
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

import numpy as np
import scipy as sp
from scipy.io import loadmat
import theano
import theano.tensor as T
from scipy import signal
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression as LR


# Load dataset
data = loadmat('sp1s_aa')
x = data['x_train']
y = np.array(data['y_train'], dtype=int)
y = y.transpose()
train_indexes, test_indexes = cross_validation.train_test_split(
    np.arange(y.size), test_size=0.2, random_state=0)
x_train = x[:,:,train_indexes]
y_train = y[train_indexes]


# Band-pass filter signal
samp_rate = 100.
(b, a) = signal.butter(5, np.array([8., 30.]) / (samp_rate / 2.), 'band')
x_train_filt = signal.filtfilt(b, a, x_train, axis=0)  

def csp(x_train_filt, y_train):
    """Calculate the Common Spatial Patterns decomposition and return
    the spatial filters W."""

    # Calculate correlation matrices
    X0 = x_train_filt[:,:,y_train[:,0]==0]
    X1 = x_train_filt[:,:,y_train[:,0]==1]

    C0 = 0.
    for i in xrange( X0.shape[2] ):
        C0 = C0 + np.dot(X0[:,:,i].transpose() , X0[:,:,i])

    C0 = C0/X0.shape[2]

    C1 = 0.
    for i in xrange( X1.shape[2] ):
        C1 = C1+np.dot(X1[:,:,i].transpose(), X1[:,:,i])

    C1 = C1/X1.shape[2]

    # Calculate CSP
    D, V = sp.linalg.eig(C1, C1 + C0)
    ind = sorted(range(D.size), key=lambda k: D[k])
    V = V[:,ind]
    W = np.hstack([V[:,0:2], V[:,25:]])

    return W


W = csp(x_train_filt, y_train)
V = np.ones((50,1))
#sc = classify_csp(W, V, x_train_filt, y_train, x_test_filt, y_test)

# Fine tune CSP pipeline
# Note input data dim: [batches, time, channel]
# Filter dim: [channel_in, channel_out]
# LogisticRegression is the class defined above (saved as logistic_sgd.py)
from logistic_sgd import LogisticRegression

x_train_filt_T = theano.shared(x_train_filt.transpose(2, 0, 1))

y_train_T      = T.cast(theano.shared(y_train[:,0]), 'int32')


lr         = .01 # learning rate
batch_size = y_train.size/4
epochs     = 2500
index      = T.lscalar('index')
y          = T.ivector('y')
X          = T.tensor3('X')
csp_w      = theano.shared(W)
avg_v      = theano.shared(V)
proj_csp   = T.tensordot(X,csp_w,axes=[2,0])
layer0_out = T.pow(proj_csp, 2)
variance   = T.tensordot(layer0_out, avg_v, axes=[1,0])

layer1_out = T.log((variance))[:,:,0]
layer2     = LogisticRegression(input=layer1_out, n_in=5, n_out=2)
cost       = layer2.negative_log_likelihood(y)+.01*T.sum(T.pow(avg_v,2))

params  = [csp_w, avg_v] + layer2.params

grads   = T.grad(cost,params)
updates = []
for param_i, grad_i in zip(params,grads):
    updates.append((param_i, param_i - lr*grad_i))


train_model = theano.function([index], cost, updates=updates,
      givens={
          X: x_train_filt_T[index * batch_size: (index + 1) * batch_size],
          y: y_train_T[index * batch_size: (index + 1) * batch_size]})
for i in range(epochs):
    for j in range(y_train.size/batch_size):
        cost_ij = train_model(j)
    print 'Epoch = %i' % i
    print 'Cost = %f' % cost_ij

W, V are some numpy matrices (computed above).

So, after a few thousand iterations, cost_ij becomes NaN. I then checked params and found that after the last gradient descent step every value in these matrices had become NaN, while at the step before that the values were neither especially small nor especially large. I believe the trouble is in the line param_i - lr*grad_i. So, how can I check the numerical values of grad_i (or grads[0]) at this (or the previous) gradient descent step?
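One way to do this, as a minimal sketch reusing the index, grads, givens, and batch_size already defined above (the NaN check in the loop is my own addition for illustration, not part of the original script), is to compile a second Theano function that returns the norm of each gradient and call it just before every update:

# Sketch: debug function returning the L2 norm of each gradient for a
# given mini-batch, using the same givens as train_model
grad_norms    = [T.sqrt(T.sum(T.pow(g, 2))) for g in grads]
inspect_grads = theano.function([index], grad_norms,
      givens={
          X: x_train_filt_T[index * batch_size: (index + 1) * batch_size],
          y: y_train_T[index * batch_size: (index + 1) * batch_size]})

for i in range(epochs):
    for j in range(y_train.size/batch_size):
        norms = inspect_grads(j)  # gradient norms BEFORE the update
        if any(np.isnan(n) for n in norms):
            print 'NaN gradient at epoch %i, batch %i: %s' % (i, j, norms)
        cost_ij = train_model(j)

Alternatively, if your Theano version ships theano.compile.nanguardmode.NanGuardMode, compiling train_model with mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) makes Theano raise an error at the first op whose output contains a NaN, Inf, or very large value, which localizes where the NaN is first produced.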

0 Answers