Question

我理解带反向传播的神经网络应该如何工作，也知道如何使用 Python 的 sklearn 中自带的 MLPClassifier 及其 fit 函数。我正在自己从头实现一个，因为我想更好地了解其中的细节。我将首先展示我的代码（带注释），然后讨论我的问题。

import numpy as np
import scipy as sp
import sklearn as ML

def sigmoid(z):
    """Apply the logistic sigmoid 1 / (1 + exp(-z)) element-wise.

    z: the linear combination of the previous layer (a number or a
       NumPy array).

    Returns the activation for the node(s), shaped like z.
    """
    denominator = np.exp(-z) + 1
    return 1 / denominator

def sig_grad(z):
    """Evaluate the derivative of the sigmoid at z.

    z: the linear combination (pre-activation) of a layer.

    Returns (1 - sigmoid(z)) * sigmoid(z), computed element-wise.
    """
    s = sigmoid(z)
    return (1 - s) * s

def feedforward(input, hidden_layers, num_layers, num_output, thresh, weights):
    """Run one forward pass of a sample through the network.

    input: the data we want to send through the network (expected to be
           a 1-D sample vector, since it is stacked next to thresh[0])
    hidden_layers: the number of nodes in each hidden layer
    num_layers: how many hidden layers sit between the input layer and
                the output layer
    num_output: how many nodes the output layer has
    thresh: the threshold values; thresh[i] is placed in front of the
            vector that is multiplied by weight matrix i
    weights: list of weight matrices, input-to-hidden first and
             hidden-to-output last; the list and its arrays are not
             modified

    Returns (A, Z): A[i] holds the sigmoid activations and Z[i] the
    linear combinations of layer i, the last entry being the output layer.
    """
    # Work on a shallow copy so the caller's list is never rebound.
    w = list(weights)

    # Put a row of ones on top of every weight matrix; that row multiplies
    # the threshold value stacked in front of each layer's input vector.
    w[0] = np.vstack([np.ones([1, hidden_layers]), w[0]])
    for i in range(1, num_layers):
        w[i] = np.vstack([np.ones([1, hidden_layers]), weights[i]])
    w[-1] = np.vstack([np.ones([1, num_output]), w[-1]])

    # Vector fed to the first weight matrix: threshold first, then the data.
    X = np.hstack([thresh[0], input])
    A = []  # activations of every layer
    Z = []  # linear combinations of every layer

    # Cycle through the hidden layers.
    for i in range(num_layers):
        Z.append(np.dot(X, w[i]))
        A.append(sigmoid(Z[i]))
        X = np.hstack([thresh[i + 1], A[i]])

    # Find the output/last layer activations.
    Z.append(np.dot(X, w[-1]))
    A.append(sigmoid(Z[-1]))

    return A, Z

def backprop(input, truth, activations, combos, num_layers, weights):
    """Back-propagate the output error and collect the gradients.

    input: the sample that was fed through the network
    truth: what we know the output should be
    activations: the activations determined at each layer by the sigmoid
                 function in the previous feedforward pass
    combos: the linear combinations at each layer in the previous
            feedforward pass
    num_layers: the number of hidden layers
    weights: the weight matrices, one per layer of connections (without
             the row of ones that feedforward adds internally)

    Returns (derivW, derivb): the gradients with respect to the weights
    and the threshold of every layer, ordered from the output layer back
    to the input layer (derivW[0] belongs to weights[num_layers]).
    """
    # error[i] is the error term of the layer produced by weights[i].
    error = [None] * (num_layers + 1)
    derivW = []
    derivb = []

    # The output layer is seeded directly from the mismatch with truth.
    error[num_layers] = (
        (activations[num_layers] - truth) * sig_grad(combos[num_layers])
    )

    # Walk from the output layer back to the first hidden layer.
    for i in range(num_layers, -1, -1):
        if i < num_layers:
            # Pull the error of the following layer back through its weights.
            error[i] = np.dot(weights[i + 1], error[i + 1]) * sig_grad(combos[i])

        # The first weight matrix is fed by the raw input, every other one
        # by the activations of the layer before it.
        feeding = activations[i - 1] if i > 0 else input

        # The outer product gives the gradient the same shape as weights[i].
        derivW.append(np.outer(feeding, error[i]))
        # The threshold reaches every node through a weight of one, so its
        # gradient is the sum of the layer's errors.
        derivb.append(np.sum(error[i]))

    return derivW, derivb

def gradDesc(weights, thresh, derivb, derivW, stepsize, num_layers,
             num_steps=100):
    """Move the weights and thresholds against their gradients.

    weights: our network's weights to update (list, one matrix per layer
             of connections); the list entries are replaced in place
    thresh: the threshold values to update; written in place
    derivb: the derivative of our cost function with respect to the
            threshold of each layer, output layer first
    derivW: the derivative of our cost function with respect to the
            weights of each layer, output layer first
    stepsize: how big a step we take along the negative gradient
    num_layers: the number of hidden layers
    num_steps: how many times the update is repeated (default 100, the
               value that used to be hard-coded). The gradients are not
               recomputed between repetitions, so this amounts to one
               step of size num_steps * stepsize; pass 1 for a single
               plain gradient-descent step.

    Returns the updated weights and threshold values for our network.
    """
    for layer in range(num_layers + 1):
        # The gradient lists start at the output layer, so read them in
        # reverse relative to the weight list.
        grad_index = num_layers - layer
        for _ in range(num_steps):
            weights[layer] = weights[layer] - stepsize * derivW[grad_index]
            thresh[layer] = thresh[layer] - stepsize * derivb[grad_index]

    return weights, thresh


def nNetwork(input, truth, hidden_layers, num_layers, num_output, maxiter, stepsize):
    """Randomly initialise a network and train it on one sample.

    input: the data to send through the network; input.size is used as
           the number of input nodes
    truth: what we know the output should be
    hidden_layers: the number of nodes in each hidden layer
    num_layers: the number of hidden layers between the input layer and
                the output layer
    num_output: the number of nodes in the output layer
    maxiter: how many feedforward / backprop / gradDesc rounds to run
    stepsize: the step size handed to gradDesc

    Returns (weights, thresh) after the last training round.
    """
    # Number of values in the sample, i.e. the width of the input layer.
    m = input.size

    # One threshold value per layer of connections.
    thresh = np.random.randn(num_layers + 1, 1)

    # The weights are kept in a list because each layer might have a
    # different number of weights: input -> hidden, hidden -> hidden
    # (num_layers - 1 times), hidden -> output.
    weights = [np.random.randn(m, hidden_layers)]
    weights.extend(
        np.random.randn(hidden_layers, hidden_layers)
        for _ in range(1, num_layers)
    )
    weights.append(np.random.randn(hidden_layers, num_output))

    for _ in range(maxiter):
        activations, combos = feedforward(
            input, hidden_layers, num_layers, num_output, thresh, weights
        )
        derivW, derivb = backprop(
            input, truth, activations, combos, num_layers, weights
        )
        weights, thresh = gradDesc(
            weights, thresh, derivb, derivW, stepsize, num_layers
        )

    return weights, thresh

def main():
    """Train a very, very simple neural network on one hard-coded sample."""
    # A single three-value sample whose known output is 0.
    sample = np.array([1, 0, 0])
    target = 0

    # Train the network: 2 hidden layers of 3 nodes each and 1 output node.
    trained_weights, trained_thresh = nNetwork(
        sample,
        target,
        hidden_layers=3,
        num_layers=2,
        num_output=1,
        maxiter=10,
        stepsize=0.001,
    )

    # Testing the network on a new set of arguments is not done yet; it
    # would mean calling feedforward with the trained weights and thresholds.

# Script entry point: the demo training runs whenever this file is executed
# or imported, because the call is not guarded.
main()

我已经用简单的例子测试过这段代码，其中输入是n维的，输出也是n维的（当我在控制台中直接运行 NN.py 时，仍有错误没能解决；但当我在控制台中一段一段地运行代码时，它可以正常工作）。我有几个问题，希望能帮助我更好地理解当输入有m个维度时会发生什么。例如，Python中的手写数字（digits）数据集（共有1797个样本，每个样本是由8x8图像展平得到的64x1向量）。

1）64个像素中的每个像素都被视为输入吗？如果是这样，神经网络是否一次训练一个图像？这对我来说很容易解决。

2）如果神经网络同时训练了所有图像，那么修改我的代码的建议是什么？

3）显然，图像的输出是0,1,2,3，...或9中的一个。但是，输出是否应该是一个10x1的向量，在图像所代表的数字对应的位置为1，其他位置为0？那么，我的预测向量应该在最可能为1的位置取得最大值，对吗？

4）然后，如果＃2为真，我不太确定＃3会是什么样的。

我为长篇笔记道歉。谢谢你看一看并帮助我更好地理解！

我的第一个神经网络（MLP），从零开始实现，Python - 问题

0 个答案: