Why does normalizing the input in one dimension give different accuracy?

Date: 2015-06-24 18:20:07

Tags: neural-network

Below is neural network code I adapted from Stephen Welch's YouTube series. I am wondering why, when I rescale all of the values of X (to normalize it), I seem to get different results. With X/10 the prediction accuracy is above 90%, but with X/4000, for example (the largest value in X is 3695), I only get about 80% accuracy. This makes no sense to me, because dividing all of X by a fixed constant is just like changing units; it should not affect the output at all.

Where does this large difference come from? I don't fully grasp the math behind it.
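To make the "changing units" intuition concrete, here is a minimal check (the array sizes and values below are made up just for illustration) that a rescaled weight matrix can exactly absorb a constant rescaling of the inputs, which is why I expected identical results:

import numpy as np

# sigmoid(X @ W1) == sigmoid((X/c) @ (c*W1)), so a network fed X/c can in
# principle represent exactly the same function as one fed X.
rng = np.random.RandomState(0)
X_demo = rng.rand(5, 17) * 3695      # hypothetical inputs, max value around 3695
W1_demo = rng.randn(17, 17)
c = 400.0                            # e.g. going from X/10 to X/4000

print(np.allclose(X_demo @ W1_demo, (X_demo / c) @ (c * W1_demo)))   # prints True

The full code I am running is below.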

import numpy as np
from matplotlib.pyplot import *
from scipy import optimize
import random
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm

X = np.loadtxt('xdata.txt', dtype=float)
y = np.loadtxt('ydata.txt', dtype=float)
y = np.reshape(y, (148,1))

X = X/10
y = y/5

Lambda = 0.0003

class Neural_Network(object):
    def __init__(self, Lambda = 0):        
        #Define Hyperparameters
        self.inputLayerSize = 17
        self.outputLayerSize = 1
        self.hiddenLayerSize = 17

        #Weights (parameters)
        self.W1 = np.random.randn(self.inputLayerSize,self.hiddenLayerSize)
        self.W2 = np.random.randn(self.hiddenLayerSize,self.outputLayerSize)

        self.Lambda = Lambda

    def forward(self, X):
        #Propagate inputs through network
        self.z2 = np.dot(X, self.W1)
        self.a2 = self.sigmoid(self.z2)
        self.z3 = np.dot(self.a2, self.W2)
        yHat = self.sigmoid(self.z3) 
        return yHat

    def sigmoid(self, z):
        #Apply sigmoid activation function to scalar, vector, or matrix
        return 1/(1+np.exp(-z))

    def sigmoidPrime(self,z):
        #Gradient of sigmoid
        return np.exp(-z)/((1+np.exp(-z))**2)

    def costFunction(self, X, y):
        #Compute cost for given X,y, use weights already stored in class.
        self.yHat = self.forward(X)
        J = 0.5*sum((y-self.yHat)**2)/X.shape[0] + (self.Lambda/2)*(sum(sum(self.W1**2)) + sum(self.W2**2))

        return J

    def costFunctionPrime(self, X, y):
        #Compute derivative with respect to W1 and W2 for a given X and y:
        self.yHat = self.forward(X)

        delta3 = np.multiply(-(y-self.yHat), self.sigmoidPrime(self.z3))
        dJdW2 = np.dot(self.a2.T, delta3)/X.shape[0] + self.Lambda*self.W2

        delta2 = np.dot(delta3, self.W2.T)*self.sigmoidPrime(self.z2)
        dJdW1 = np.dot(X.T, delta2)/X.shape[0] + self.Lambda*self.W1  

        return dJdW1, dJdW2

    #Helper Functions for interacting with other classes:
    def getParams(self):
        #Get W1 and W2 unrolled into vector:
        params = np.concatenate((self.W1.ravel(), self.W2.ravel()))
        return params

    def setParams(self, params):
        #Set W1 and W2 using a single parameter vector.
        W1_start = 0
        W1_end = self.hiddenLayerSize * self.inputLayerSize
        self.W1 = np.reshape(params[W1_start:W1_end], (self.inputLayerSize , self.hiddenLayerSize))
        W2_end = W1_end + self.hiddenLayerSize*self.outputLayerSize
        self.W2 = np.reshape(params[W1_end:W2_end], (self.hiddenLayerSize, self.outputLayerSize))

    def computeGradients(self, X, y):
        dJdW1, dJdW2 = self.costFunctionPrime(X, y)
        return np.concatenate((dJdW1.ravel(), dJdW2.ravel()))

def computeNumericalGradient(N, X, y):
    paramsInitial = N.getParams()
    numgrad = np.zeros(paramsInitial.shape)
    perturb = np.zeros(paramsInitial.shape)
    e = 1e-4

    for p in range(len(paramsInitial)):
        #Set perturbation vector
        perturb[p] = e
        N.setParams(paramsInitial + perturb)
        loss2 = N.costFunction(X, y)

        N.setParams(paramsInitial - perturb)
        loss1 = N.costFunction(X, y)

        #Compute Numerical Gradient
        numgrad[p] = (loss2 - loss1)/(2*e)

        #Reset the perturbation entry to zero:
        perturb[p] = 0

    #Return Params to original value:
    N.setParams(paramsInitial)

    return numgrad

class trainer(object):
    def __init__(self, N):
        self.N = N

    def callbackF(self, params):
        self.N.setParams(params)
        self.J.append(self.N.costFunction(self.X, self.y))
        self.testJ.append(self.N.costFunction(self.testX, self.testY))

    def costFunctionWrapper(self, params, X, y):
        self.N.setParams(params)
        cost = self.N.costFunction(X, y)
        grad = self.N.computeGradients(X, y)

        return cost, grad

    def train(self, trainX, trainY, testX, testY):
        self.X = trainX
        self.y = trainY
        self.testX = testX
        self.testY = testY

        self.J = []
        self.testJ = []

        params0 = self.N.getParams()

        options = {'maxiter': 1000, 'disp': True}
        _res = optimize.minimize(self.costFunctionWrapper, params0, jac = True, method = 'BFGS', args = (trainX, trainY), options = options, callback = self.callbackF)
        self.N.setParams(_res.x)
        self.optimizationResults = _res


trainX, testX, trainY, testY = cross_validation.train_test_split(X, y, test_size = 0.3, random_state=0)

NN = Neural_Network(Lambda = 0.0003)
numgrad = computeNumericalGradient(NN, X, y)
grad = NN.computeGradients(X,y)
print(np.linalg.norm(grad-numgrad)/np.linalg.norm(grad+numgrad))

T = trainer(NN)
T.train(X, y, testX, testY)

Actual_value = y*5
Predicted_value = NN.forward(X)*5

plot(Actual_value, Predicted_value, 'or')
ylim([0,5])
xlim([0,5])
grid(1)
show()


error = []
Diff = abs(Actual_value - Predicted_value)
for i in Diff:
    if i >= 0.5:
        error.append(i)

#print(Actual_value)
#print()
#print(Predicted_value)
print(max(Diff))        
#print(error)
print(len(error))

1 Answer:

Answer 0 (score: 0):

It is because the scaling changes the error surface. Gradient descent moves across that error surface. If you normalize the inputs properly, the network converges to the global minimum quickly. But if the normalization stretches the error surface badly, the optimizer has to crawl across it and the process becomes much slower; sometimes it never gets there at all.
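As a rough illustration of what "normalizing properly" usually means here (a generic sketch, not the exact preprocessing your data needs): scale each input feature into a comparable, moderate range, for example [0, 1] or zero mean and unit variance, instead of dividing everything by one large constant.

import numpy as np

X = np.loadtxt('xdata.txt', dtype=float)

# Min-max scaling: every feature ends up in [0, 1]
# (assumes no feature is constant, otherwise the denominator is zero).
X_minmax = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# Z-score scaling: every feature gets zero mean and unit variance.
X_zscore = (X - X.mean(axis=0)) / X.std(axis=0)

With either of these, the pre-activations X·W1 start out at a magnitude the sigmoid handles well, so the error surface is reasonably conditioned and BFGS makes good progress within its iteration budget. Dividing by 4000 instead pushes every input close to zero, which flattens the gradients around the random initial weights and gives exactly the kind of badly scaled error surface described above.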
