MNIST dataset: building a neural network with numpy, can't fix a broadcasting error

Time: 2017-08-21 12:46:48

Tags: numpy machine-learning neural-network deep-learning

Here is my code:

import numpy as np
import random

class Network:
    #layers, biases, weights
    def __init__(self, size):
        self.nr_layers = len(size)
        self.size = size
        self.bias = [np.random.rand(y, 1) for y in size[1:]]
        self.weights = [np.random.randn(x, y) for x, y in zip(size[1:], size[:-1])]

    def feedfoward(self, a):
        #a is activation of last layer(or input)
        for b,w in zip(self.bias, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return(a)

    def SGD(self, training_data, test_data, nr_epoch, mini_batch_size, learning_rate):
        test_data = list(test_data)
        training_data = list(training_data)
        n_test_data = len(test_data)
        n_training_data = len(training_data)
        #build mini batches
        for i in range(nr_epoch):
            random.shuffle(training_data)
            mini_batches = [training_data[j:j + mini_batch_size]
                            for j in range(0,n_training_data,mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, learning_rate)
            print("Epoch {} : {} / {}".format(i, self.evaluate(test_data), n_test_data))

    def update_mini_batch(self, mini_batch, learning_rate):
        bias_gradient = [np.zeros(b.shape) for b in self.bias]
        weights_gradient = [np.zeros(w.shape) for w in self.weights]
        #summing up gradients for weights and biases(calculate each gradient with backprop)
        for x, y in mini_batch:
            delta_b, delta_w = self.backprop(x, y)
            bias_gradient = [b + db for b, db in zip(bias_gradient, delta_b)]
            weights_gradient = [w + db for w, db in zip(weights_gradient, delta_w)]
        #now we update original weights and biases with gradient descent formula
        self.bias = [b - (learning_rate/len(mini_batch)) * change
                     for b, change in zip(self.bias, bias_gradient)]
        self.weights = [w - (learning_rate/len(mini_batch)) * change
                        for w, change in zip(self.weights, weights_gradient)]

    def backprop(self, x, y):
        bias_gradient = [np.zeros(bias.shape) for bias in self.bias]
        weights_gradient = [np.zeros(weights.shape) for weights in self.weights]
        activation = x
        activations = [x]
        #zs are weighted inputs
        zs = []
        #FEEDFOWARD
        for b, w in zip(self.bias, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        #BACKWARD PASS
        #first last layer(backprop formula #1), then we assign BP3 and BP4
        delta = self.last_layer_cost(activations[-1], y) * sigmoid_derivative(zs[-1])
        bias_gradient = delta
        weights_gradient = np.dot(delta, activations[-2].transpose())
        #now we apply BP formula #2 to all others(l-2) layers, then we assign BP3 and BP4
        #first layer in this loop is last layer before output(a^L)
        for l in range(2, self.nr_layers):
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sigmoid_derivative(zs[-l])
            bias_gradient = delta
            weights_gradient = np.dot(delta, activations[-l - 1].transpose())
        return weights_gradient, weights_gradient

    def last_layer_cost(self, last_layer_activation, y):
        return(last_layer_activation - y)

    def evaluation(self, test_data):
        test_result = [(np.argmax(self.feedfoward(x), y)) for x, y in test_data]
        return sum(int(x==y) for x, y in test_result)


def sigmoid(z):
    return 1.0/(1.0 + np.exp(-z))

def sigmoid_derivative(z):
    return sigmoid(z)*(1-sigmoid(z))

import pickle
import gzip

# Next part is copied from solutions
import numpy as np

def load_data():
    """Return the MNIST data as a tuple containing the training data,
    the validation data, and the test data.
    The ``training_data`` is returned as a tuple with two entries.
    The first entry contains the actual training images.  This is a
    numpy ndarray with 50,000 entries.  Each entry is, in turn, a
    numpy ndarray with 784 values, representing the 28 * 28 = 784
    pixels in a single MNIST image.
    The second entry in the ``training_data`` tuple is a numpy ndarray
    containing 50,000 entries.  Those entries are just the digit
    values (0...9) for the corresponding images contained in the first
    entry of the tuple.
    The ``validation_data`` and ``test_data`` are similar, except
    each contains only 10,000 images.
    This is a nice data format, but for use in neural networks it's
    helpful to modify the format of the ``training_data`` a little.
    That's done in the wrapper function ``load_data_wrapper()``, see
    below.
    """
    f = gzip.open('mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
    f.close()
    return (training_data, validation_data, test_data)

def load_data_wrapper():
    """Return a tuple containing ``(training_data, validation_data,
    test_data)``. Based on ``load_data``, but the format is more
    convenient for use in our implementation of neural networks.
    In particular, ``training_data`` is a list containing 50,000
    2-tuples ``(x, y)``.  ``x`` is a 784-dimensional numpy.ndarray
    containing the input image.  ``y`` is a 10-dimensional
    numpy.ndarray representing the unit vector corresponding to the
    correct digit for ``x``.
    ``validation_data`` and ``test_data`` are lists containing 10,000
    2-tuples ``(x, y)``.  In each case, ``x`` is a 784-dimensional
    numpy.ndarry containing the input image, and ``y`` is the
    corresponding classification, i.e., the digit values (integers)
    corresponding to ``x``.
    Obviously, this means we're using slightly different formats for
    the training data and the validation / test data.  These formats
    turn out to be the most convenient for use in our neural network
    code."""
    tr_d, va_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    training_data = zip(training_inputs, training_results)
    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = zip(validation_inputs, va_d[1])
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = zip(test_inputs, te_d[1])
    return (training_data, validation_data, test_data)

def vectorized_result(j):
    """Return a 10-dimensional unit vector with a 1.0 in the jth
    position and zeroes elsewhere.  This is used to convert a digit
    (0...9) into a corresponding desired output from the neural
    network."""
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e

################################################################################

training_data, validation_data, test_data = load_data_wrapper()
net = Network([784, 30, 10])
net.SGD(training_data, test_data, 30, 10, 3.0)

The part copied from the solutions is the file mnist_loader.py, here.

And here is my error:

Traceback (most recent call last):
  File "C:/Users/PycharmProjects/MachineLearning/ex.py", line 157, in <module>
    net.SGD(training_data, test_data, 30, 10, 3.0)
  File "C:/Users/PycharmProjects/MachineLearning/ex.py", line 29, in SGD
    self.update_mini_batch(mini_batch, learning_rate)
  File "C:/Users/PycharmProjects/MachineLearning/ex.py", line 39, in update_mini_batch
    weights_gradient = [w + db for w, db in zip(weights_gradient, delta_w)]
  File "C:/Users/PycharmProjects/MachineLearning/ex.py", line 39, in <listcomp>
    weights_gradient = [w + db for w, db in zip(weights_gradient, delta_w)]
ValueError: operands could not be broadcast together with shapes (10,30) (784,)

I'm a beginner in DL and have only known Python and numpy for 2-3 months, but I do know what broadcasting is... I just can't solve this problem, so could anyone please take a look and suggest how to fix it? What confuses me the most is that this line is identical to the one in the solutions (I tried that approach).

Oh, a quick note on terminology: nabla_b and nabla_w are bias_gradient and weights_gradient in my version.

1 Answer:

Answer 0 (score: 0)

I think that in self.backprop, the first variable returned should be bias_gradient, not weights_gradient.
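For concreteness, here is a rough sketch of what self.backprop could look like with that return order, and with each layer's gradient stored at its own index in the lists (the way the book's network.py does with nabla_b[-1] = delta, which matches the nabla_b/nabla_w naming you mention); treat it as an illustration of the idea, not verified drop-in code:

def backprop(self, x, y):
    # keep one zero array per layer so each layer's gradient has its own slot
    bias_gradient = [np.zeros(b.shape) for b in self.bias]
    weights_gradient = [np.zeros(w.shape) for w in self.weights]
    activation = x
    activations = [x]
    zs = []
    for b, w in zip(self.bias, self.weights):
        z = np.dot(w, activation) + b
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)
    delta = self.last_layer_cost(activations[-1], y) * sigmoid_derivative(zs[-1])
    # store the output layer's gradients at index -1 instead of replacing the whole list
    bias_gradient[-1] = delta
    weights_gradient[-1] = np.dot(delta, activations[-2].transpose())
    for l in range(2, self.nr_layers):
        delta = np.dot(self.weights[-l + 1].transpose(), delta) * sigmoid_derivative(zs[-l])
        bias_gradient[-l] = delta
        weights_gradient[-l] = np.dot(delta, activations[-l - 1].transpose())
    # bias gradients first, matching the (delta_b, delta_w) unpacking in update_mini_batch
    return bias_gradient, weights_gradient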

Just a small tip: I think changing the batch size to something other than 10 might help you tell whether the 10 in the error message is the batch size or the output layer size. I've also heard that powers of 2 are computationally efficient, but I'm not sure about that :)
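For example, a run with a hypothetical batch size of 16 (any value other than 10 would do) makes the two possible meanings of 10 distinguishable in the reported shapes:

net.SGD(training_data, test_data, 30, 16, 3.0)  # mini_batch_size=16, so a 10 left in the error must come from the output layer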