Stochastic Gradient Descent (SGD)

Time: 2020-04-24 13:54:32

Tags: python-3.x neural-network

This is code taken from a Medium post. I am new to neural networks, and I am hoping someone can explain to me (please don't just give me an answer; if you can, explain it) how to change the batch size used in the code to an SGD batch size. To make the question concrete, I have put a small usage sketch after the code showing what I mean. Thanks in advance!

Code

import numpy as np

class NeuralNetwork(object):

# The constructor takes 
# 1. layers representing the number of nodes
# 2. activations representing the activation functions to choose in
#    each layer. 

def __init__(self, layers=[2, 10, 1], activations=['sigmoid', 'sigmoid']):

    # check to make sure that no. of layers is one more than 
    # no. of activation functions because the input layer 
    # has no activation function.

    assert(len(layers) == len(activations)+1)

    # define the local variables layers and activations
    self.layers = layers
    self.activations = activations

    # initialize weights and biases as two lists to hold
    # weights and biases for each layer

    self.weights = []
    self.biases = []

    # create random weights and biases
    # for each of the layers.

    for i in range(len(layers)-1):
        self.weights.append(np.random.randn(layers[i+1], layers[i]))
        self.biases.append(np.random.randn(layers[i+1], 1))
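
    # e.g. with the default layers=[2, 10, 1] this creates weights of
    # shape (10, 2) and (1, 10) and biases of shape (10, 1) and (1, 1)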

# do feedforward for x
# where x is an input

# return a list of a's and z's as expected.
# a's and z's are layer by layer

def feedforward(self, x):
    # make a copy of x
    a = np.copy(x)

    # this variable will contain all the z's
    z_s = []

    # this variable will contain all the a's
    # the output of the input layer is simply x which is the
    # input. So we initialize the a_s to contain a. 
    a_s = [a]

    # for each layer do
    for i in range(len(self.weights)):

        # retrieve the appropriate activation function
        activation_function = self.getActivationFunction(self.activations[i])

        # create z_s by z = w.a + b for each layer
        z_s.append(self.weights[i].dot(a) + self.biases[i])

        # compute a = f(z) for the layer that was just processed.
        # z_s[-1] is the z appended in the line above, so the
        # activation function is applied only to the current layer.

        a = activation_function(z_s[-1])

        # keep track of the new activation or a_s 
        a_s.append(a)

    # return both z_s and a_s
    # we will have z_s and a_s for each layer
    return (z_s, a_s)
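
    # shape note (my reading of the code, not from the original post):
    # x is expected to be laid out as (layers[0], n_samples), so each
    # a_s[i] has shape (layers[i], n_samples) and each z_s[i] has
    # shape (layers[i+1], n_samples)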


# takes the y -- the actual answer, a's and z's and 
# calculates the dLoss/dw and dLoss/db

def backpropagation(self, y, z_s, a_s, normalize='no', ignore_b='no'):

    # initialize list of dLoss/dw and dLoss/db
    dw = []  # dLoss/dw
    db = []  # dLoss/db

    # create an empty list of deltas, one for each weight
    deltas = [None] * len(self.weights)  # delta = dLoss/dz  known as error for each layer


    # start from the back and insert the last layer error
    # based on the square loss function. Note -1 is used to 
    # fill things from the back of the list 
    # also note that we need to use the derivative function 
    # for the activation function.
    # note that we do not need to use the 2 in the loss function derivation

    # again note this is for the last layer only!

    deltas[-1] = ((y-a_s[-1])*(self.getDerivitiveActivationFunction(self.activations[-1]))(z_s[-1]))


    # Perform BackPropagation

    # for the rest of the deltas, go in reverse order
    for i in reversed(range(len(deltas)-1)):
        deltas[i] = self.weights[i+1].T.dot(deltas[i+1])*(self.getDerivitiveActivationFunction(self.activations[i])(z_s[i]))        

    # now we need to update the weights based on the calculated
    # deltas

    # determine the batch size from the second dimension of y's shape,
    # i.e. the number of samples in this batch (one column of y per x)

    batch_size = y.shape[1]

    # determine the two derivatives by taking 
    # the average according to batch sizes 

    db = [d.dot(np.ones((batch_size,1)))/float(batch_size) for d in deltas]  
    dw = [d.dot(a_s[i].T)/float(batch_size) for i,d in enumerate(deltas)]
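
    # (the dot products above sum each delta over the batch dimension;
    # dividing by batch_size turns those sums into averages)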

    #for i,d in enumerate(dw):
    #  print("dw[",i,"]=", d,"\n")

    #Q3 normalize if normalize is anything but no.
    if(normalize!='no'):
      dw = [d/np.linalg.norm(d) for d in dw]
      #for i,d in enumerate(dw):
      # print("dwn[",i,"]=", d,"\n")

    #Q4 ignore b's
    if(ignore_b!='no'):
      db = [d*0 for d in db]
      #for i,d in enumerate(dw):
      # print("dwn[",i,"]=", d,"\n")

    # return the derivatives with respect to the weight matrices and biases
    return dw, db


# Now we will write the main training function that uses
# feedforward and backpropagation many times (called epochs)
# lr (learning rate) is the eta in our equations.

def train(self, x, y, batch_size=10, epochs=100, lr = 0.01, normalize='no', ignore_b='no'):

    # create an array to store the loss for Q1
    saved_loss = np.zeros((epochs,2))

    # update weights and biases based on the output
    # for the number of epochs

    for e in range(epochs): 
        i=0

        # Do the training in batches
        # each batch is a subset of the original 
        # data 

        while(i<len(y)):

            # extract a batch
            x_batch = x[i:i+batch_size]
            y_batch = y[i:i+batch_size]

            # update i for the next batches
            i = i+batch_size
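
            # note: x[i:i+batch_size] and y[i:i+batch_size] slice along
            # the first axis, so this loop only yields real mini-batches
            # if the individual samples are laid out along that axis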

            # do the feedforward for the batch and update the weights
            # based on the average loss for each weight for the whole
            # batch.

            z_s, a_s = self.feedforward(x_batch)

            # do the back propagation 
            dw, db = self.backpropagation(y_batch, z_s, a_s, normalize, ignore_b)


            # update the weights for each pair of weights and dw
            # and biases and db

            self.weights = [w + lr*dweight for w, dweight in zip(self.weights, dw)]
            self.biases = [b + lr*dbias for b, dbias in zip(self.biases, db)]

            # compute the loss for this batch as the norm of the
            # error and record it for this epoch (Q1)

            loss = np.linalg.norm(a_s[-1]-y_batch)

            saved_loss[e,0] = e
            saved_loss[e,1] = loss
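
            # note that saved_loss[e] is overwritten for every batch,
            # so only the loss of the last batch in each epoch is kept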

            #print("loss = ", np.linalg.norm(a_s[-1]-y_batch) )

    # for Q1
    return saved_loss

# This function returns an activation function
# given its name.

@staticmethod
def getActivationFunction(name):
    if(name == 'sigmoid'):
        return lambda x : np.exp(x)/(1+np.exp(x))
    elif(name == 'linear'):
        return lambda x : x
    elif(name == 'relu'):
        def relu(x):
            y = np.copy(x)
            y[y<0] = 0
            return y
        return relu
    # Q2
    elif(name == 'leaky_relu'):
        def leaky_relu(x):
            y = np.copy(x)
            y[y<0] *= 0.01
            return y
        return leaky_relu
    else:
        print('Unknown activation function. linear is used')
        return lambda x: x

# This function returns the derivative of a function depending
# on its name.

@staticmethod
def getDerivitiveActivationFunction(name):
    if(name == 'sigmoid'):
        sig = lambda x : np.exp(x)/(1+np.exp(x))
        return lambda x :sig(x)*(1-sig(x)) 
    elif(name == 'linear'):
        return lambda x: 1
    elif(name == 'relu'):
        def relu_diff(x):
            y = np.copy(x)
            y[y>=0] = 1
            y[y<0] = 0
            return y
        return relu_diff
    #Q2 
    elif(name=='leaky_relu'):
        def leaky_relu_diff(x):
          dx = np.ones_like(x)
          dx[x < 0] = 0.01
          return dx
        return leaky_relu_diff
    else:
        print('Unknown activation function. linear is used')
        return lambda x: 1
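
To make the question concrete, here is roughly how I picture the network being used (this is just my own sketch, not code from the Medium post; the sine toy data, the layer sizes, the output activation and the learning rate are assumptions I made):

import numpy as np

# toy 1-D regression data; the (n_features, n_samples) layout is what
# feedforward() appears to expect, since each weight matrix is applied
# on the left of the activations
X = 2 * np.pi * np.random.rand(1000).reshape(1, -1)   # shape (1, 1000)
y = np.sin(X)                                          # shape (1, 1000)

nn = NeuralNetwork(layers=[1, 10, 1], activations=['sigmoid', 'linear'])

# mini-batch style training, as train() is written above
loss_history = nn.train(X, y, batch_size=10, epochs=100, lr=0.1)

# my guess at "SGD" is batch_size=1, i.e. one sample per weight update,
# but I am not sure the slicing in train() actually produces
# single-sample batches with this data layout
loss_history_sgd = nn.train(X, y, batch_size=1, epochs=100, lr=0.1)

What I would like to understand is which part of train() has to change (the batch_size argument, the slicing of x and y, or the shape I give the data) so that the updates become genuine one-sample SGD updates.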

0 Answers:

No answers yet