I am using Theano to minimize a function with stochastic gradient descent. Here is the relevant part of logistic_sgd.py:
import numpy
import theano
import theano.tensor as T
class LogisticRegression(object):
    def __init__(self, input, n_in, n_out):
        # Shared weight matrix and bias vector, initialized to zeros
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),
                               name='W', borrow=True)
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX),
                               name='b', borrow=True)
        # Class probabilities and predicted class
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
And here is the main script:

import numpy as np
import scipy as sp
import scipy.linalg  # sp.linalg.eig is used below and needs an explicit import
from scipy.io import loadmat
from scipy import signal
import theano
import theano.tensor as T
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression as LR
# Load dataset
data = loadmat('sp1s_aa')
x = data['x_train']
y = np.array(data['y_train'], dtype=int)
y = y.transpose()
train_indexes, test_indexes = cross_validation.train_test_split(
    np.arange(y.size), test_size=0.2, random_state=0)
x_train = x[:, :, train_indexes]
y_train = y[train_indexes]
# Band-pass filter signal
samp_rate = 100.
(b, a) = signal.butter(5, np.array([8., 30.]) / (samp_rate / 2.), 'band')
x_train_filt = signal.filtfilt(b, a, x_train, axis=0)
def csp(x_train_filt, y_train):
    """Calculate the Common Spatial Patterns decomposition and
    return the spatial filters W."""
    # Calculate correlation matrices for each class
    X0 = x_train_filt[:, :, y_train[:, 0] == 0]
    X1 = x_train_filt[:, :, y_train[:, 0] == 1]
    C0 = 0.
    for i in xrange(X0.shape[2]):
        C0 = C0 + np.dot(X0[:, :, i].transpose(), X0[:, :, i])
    C0 = C0 / X0.shape[2]
    C1 = 0.
    for i in xrange(X1.shape[2]):
        C1 = C1 + np.dot(X1[:, :, i].transpose(), X1[:, :, i])
    C1 = C1 / X1.shape[2]
    # Calculate CSP: solve the generalized eigenvalue problem
    # and sort the eigenvectors by eigenvalue
    D, V = sp.linalg.eig(C1, C1 + C0)
    ind = sorted(range(D.size), key=lambda k: D[k])
    V = V[:, ind]
    # Keep filters from both ends of the eigenvalue spectrum
    W = np.hstack([V[:, 0:2], V[:, 25:]])
    return W
W = csp(x_train_filt, y_train)
V = np.ones((50, 1))  # averaging weights over the time samples
#sc = classify_csp(W, V, x_train_filt, y_train, x_test_filt, y_test)
# Fine-tune the CSP pipeline
# Note input data dim: [batches, time, channel]
# Filter dim: [channel_in, channel_out]
from logistic_sgd import LogisticRegression

x_train_filt_T = theano.shared(x_train_filt.transpose(2, 0, 1))
y_train_T = T.cast(theano.shared(y_train[:, 0]), 'int32')
lr = .01  # learning rate
batch_size = y_train.size / 4
epochs = 2500

index = T.lscalar('index')
y = T.ivector('y')
X = T.tensor3('X')
csp_w = theano.shared(W)
avg_v = theano.shared(V)

# Project onto the CSP filters, square, average over time with avg_v,
# then take the log of the variance features
proj_csp = T.tensordot(X, csp_w, axes=[2, 0])
layer0_out = T.pow(proj_csp, 2)
variance = T.tensordot(layer0_out, avg_v, axes=[1, 0])
layer1_out = T.log(variance)[:, :, 0]
layer2 = LogisticRegression(input=layer1_out, n_in=5, n_out=2)

# Negative log-likelihood plus an L2 penalty on avg_v
cost = layer2.negative_log_likelihood(y) + .01 * T.sum(T.pow(avg_v, 2))
params = [csp_w, avg_v] + layer2.params
grads = T.grad(cost, params)
updates = []
for param_i, grad_i in zip(params, grads):
    updates.append((param_i, param_i - lr * grad_i))
train_model = theano.function([index], cost, updates=updates,
    givens={
        X: x_train_filt_T[index * batch_size: (index + 1) * batch_size],
        y: y_train_T[index * batch_size: (index + 1) * batch_size]})
for i in range(epochs):
    for j in range(y_train.size / batch_size):
        cost_ij = train_model(j)
    print 'Epoch = %i' % i
    print 'Cost = %f' % cost_ij
W and V here are just numpy matrices. After a few thousand iterations, cost_ij becomes NaN. I then inspected params and found that after the last gradient descent step every value in this matrix had become NaN, while at the preceding steps the values were neither especially small nor especially large. I believe the trouble is in the param_i - lr*grad_i line. So: how can I inspect the numeric values of grad_i (or grads[0]) at this (or the preceding) gradient descent step?
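To make the question concrete, the kind of thing I am aiming for is a sketch like the one below: a second compiled function that returns the gradients as extra outputs so I can check them before the update is applied. I have not verified this; debug_model and grad_vals are names I made up:

# Sketch: a debug twin of train_model that returns cost plus all gradients
# but applies no updates (debug_model and grad_vals are made-up names)
debug_model = theano.function([index], [cost] + grads,
    givens={
        X: x_train_filt_T[index * batch_size: (index + 1) * batch_size],
        y: y_train_T[index * batch_size: (index + 1) * batch_size]})

for i in range(epochs):
    for j in range(y_train.size / batch_size):
        outs = debug_model(j)             # evaluate cost and grads pre-update
        grad_vals = outs[1:]
        if any(np.isnan(g).any() or np.isinf(g).any() for g in grad_vals):
            print 'Bad gradient at epoch %i, batch %i' % (i, j)
        cost_ij = train_model(j)          # then apply the SGD step

I have also seen theano.printing.Print suggested for printing intermediate values inside the graph, but I am not sure how to apply it to grads here. Is there a better way?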