在神经网络反向传播中计算梯度偏差

时间:2018-03-23 13:28:49

标签: python neural-network gradient gradient-descent mnist

我是神经网络主题的新手,我正在学习python。

我正在尝试教一个神经网络来区分数字和mnist数据库。我有计算梯度偏差的问题。我认为它与f上的渐变相同(在应用重量矩阵之后但在计算sigmoid之前的输出),但似乎我错了 - 数字计算它给出了另一个结果。

在10000个时代之后,无论我使用哪种偏差梯度,我的网络都具有10%的准确度。

你能告诉我我的代码有什么问题(在反向传播功能中)吗?

我的代码:

    # coding: utf-8
    import random
    import numpy as np
    #import pandas as pd
    from tensorflow.examples.tutorials.mnist import input_data

    # Let's read the mnist dataset
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

    # In this exercise your task is to fill in the gaps in this code by implementing the backpropagation algorithm
    # Once this is done, you can run the network on the MNIST example and see how it performs. Feel free to play with the parameters.
    # 

    def Loss_function(x,y):
        #takes 2 vertical vectors, output: half of sum of squares of                                 differences
        return 0.5*(np.dot(np.transpose(np.array(x)-np.array(y)),np.array(x)-np.array(y)))

    def sigmoid(z):
        return 1.0/(1.0+np.exp((-1)*(np.array(z))))

    def sigmoid_prime(z):
        # Derivative of the sigmoid
        return sigmoid(np.array(z))*(1-sigmoid(np.array(z)))

    class Network(object):
        def __init__(self, sizes):
            # initialize biases and weights with random normal distr.
            # weights are indexed by target node first
            ##size 
            self.num_layers = len(sizes)
            self.sizes = sizes
            self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
            self.weights = [np.random.randn(y, x) 
                    for x, y in zip(sizes[:-1], sizes[1:])]
            #self.weights are transfer matrices to go from one layer to         next one (then apply sigmoid)
        def feedforward(self, a):
            # Run the network on a single case
            for b, w in zip(self.biases, self.weights):
                a = sigmoid(np.dot(w, a)+b)
            return a
        def feedforwardN(self, B, W, a):
            # Run the network on a single case
            for b, w in zip(B, W):
                a = sigmoid(np.dot(w, a)+b)
            return a

        def update_mini_batch(self, mini_batch, eta):
    # Update networks weights and biases by applying a single step
    # of gradient descent using backpropagation to compute the gradient.
    # The gradient is computed for a mini_batch which is as in tensorflow API.
    # eta is the learning rate
            print("update mini batch")
            nabla_b = [np.zeros(b.shape) for b in self.biases]
            nabla_w = [np.zeros(w.shape) for w in self.weights]
            print(len(mini_batch[0]))
            print(len(mini_batch[1]))
            number=1
            print(number)
            for x, y in zip(mini_batch[0],mini_batch[1]):
                delta_nabla_b, delta_nabla_w = self.backprop(x.reshape(784,1), y.reshape(10,1))
                nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
                nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
                number=number+1
                print(number) 
            print("we are outside the loop now")
            self.weights = [w-(eta/len(mini_batch[0]))*nw 
                            for w, nw in zip(self.weights, nabla_w)]
            self.biases = [b-(eta/len(mini_batch[0]))*nb 
                           for b, nb in zip(self.biases, nabla_b)]
        def backprop(self, x, y):
            # For a single input (x,y) return a pair of lists.
    # First contains gradients over biases, second over weights.

    # First initialize the list of gradient arrays
    # TODO
            gradient_b = [np.zeros(b.shape) for b in self.biases]
            gradient_w = [np.zeros(w.shape) for w in self.weights]

            gradient_bN= gradient_b
            gradient_wN= gradient_w

            # Then go forward remembering all values before and after activations
            # in two other array lists
            # TODO
        # f[i+1] output after apply matrix of weight to g[i]
        # g[i] output after applying sigmoid to f[i]
            f=[np.zeros(s) for s in self.sizes]
            g=[np.zeros(s) for s in self.sizes]
            f[0]=x
            i=1
            g[0]=f[0]
            for b, w in zip(self.biases, self.weights):
                f[i]=(np.dot(w, g[i-1])+b)
                g[i]=sigmoid(f[i])
                i=i+1
            #f[0] i g[0] are not really needed
            # Now go backward from the final cost applying backpropagation
            # TODO
            gradient_g=g
            gradient_f=f
            n=len(g)
            gradient_g[n-1]=(np.array(g[n-1])-np.array(y))
            gradient_f[n-1]=np.multiply(np.multiply(gradient_g[n-1], g[n-1]), (1-g[n-1]))


            for i in range(n-2, 0, -1):
                gradient_g[i]=np.dot(np.transpose(np.array(self.weights[i])), gradient_f[i+1]) 
                gradient_f[i]=np.multiply(np.multiply(gradient_g[i], g[i]), (1-g[i]))

            gradient_b=gradient_f[1:]
            for i in range(0,len(gradient_w)):
                gradient_w[i]=np.outer(gradient_f[i+1], g[i]) 

            #numerical gradient
            epsilon=0.000001
            bN = [np.array(M) for M in self.biases]
            predY= self.feedforwardN(self.biases, self.weights, x)
            predL = Loss_function(predY,y)
            for i in range(0, len(self.biases)):
                for j in range(0, len(self.biases[i])):
                    bN = [np.array(M) for M in self.biases]
                    bN[i][j] = bN[i][j] + epsilon
                    predYE= self.feedforwardN(bN, self.weights, x)
                    gradient_bN[i][j]=(Loss_function(predYE,y)-        Loss_function(predY,y))/epsilon

            differenceB=0.0
            rI=0
            rJ=0
            for i in range(0, len(gradient_b)):
                for j in range(0, len(gradient_b[i])):
                    if (max(np.absolute(differenceB), np.absolute(gradient_bN[i][j]- gradient_b[i][j])) > differenceB):
                        rI=i
                        rJ=j
                    differenceB= max(np.absolute(differenceB), np.absolute(gradient_bN[i][j]- gradient_b[i][j]))

            print ("maximum over entries of difference of numerical and usual gradient of B") 
            print(differenceB)
            print("in entry rI rJ")
            print(rI,rJ)
            print(gradient_b[rI][rJ], gradient_bN[rI][rJ])



            return gradient_b,gradient_w

        def evaluate(self, test_data):
            # Count the number of correct answers for test_data
            test_results = [(np.argmax(self.feedforward(test_data[0][i].reshape(784,1))), np.argmax(test_data[1][i]))
                            for i in range(len(test_data[0]))]
            #print test_results
            return sum(int(x == y) for (x, y) in test_results)

        def cost_derivative(self, output_activations, y):
            return (output_activations-y) 

        def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
            for j in xrange(epochs):
                print("epoch nr")
                print(j)
                self.update_mini_batch(training_data.next_batch(mini_batch_size), eta)
                if test_data:
                    print "Epoch {0}: {1} / {2}".format(
                j,         self.evaluate(test_data.next_batch(mini_batch_size)), mini_batch_size)
                else:
                    print "Epoch {0} complete".format(j)


    network = Network([784,30,10])

   network.SGD(mnist.train,epochs=10000,mini_batch_size=100,eta=3.0,test_data=mnist.test)

0 个答案:

没有答案