I am trying to finish my neural network for the MNIST handwritten digit database, trained with the backpropagation algorithm. My code is shown below - I will try to pare it down.
class NeuralNetwork:
def __init__(...):
#Here I just assign some values and initialize the weight and bias parameters
#L - number of layers, i.e. len(self.ld)
for i in range (1, L):
self.parameters["W" + str(i)] = np.random.randn(self.ld[i-1], self.ld[i])*0.01
self.parameters["b" + str(i)] = np.zeros((1, self.ld[i]))
def linear_forward(A_prev, W, b):
#A_prev - activation value from previous layer | W - weight | b - bias
Z = np.dot(A_prev, W) + b
cache = (Z, A_prev, W, b)
#storing these variables in the cache because they are needed for backpropagation
return cache
def linear_activation_forward(A_prev, W, b, activation):
#activation is a string like "relu" or "sigmoid" -> it determines which activation function is used
logits = [] #used later for softmax
cache = linear_forward(A_prev, W, b)
Z = cache[0] #Z is stored first in the cache
if activation == "sigmoid":
A, Z = sigmoid(Z)
#.
#.
#.
elif activation == "softmax":
logits, Z = softmax(Z)
A = logits #the softmax output is also the activation of this layer
newCache = (Z, A_prev, W, b)
return A, newCache, logits
#I will also include my activation functions sigmoid and softmax, even though they are not needed for understanding my problem
def sigmoid(z):
A = 1/(1+np.exp(-z))
assert(A.shape == z.shape)
zCache = z
return A, zCache
def softmax(z):
exp = np.exp(z)
assert(exp.shape == z.shape)
logits = exp/np.sum(exp, axis = 0, keepdims = True)
return logits, z
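Side note: plain np.exp(z) overflows for large z, which is one common source of NaN. For reference, a numerically stable softmax subtracts the row-wise maximum before exponentiating; a minimal sketch (assuming samples are stored as rows, so the normalization runs over axis=1 - that layout is my assumption, not taken from the code above):

import numpy as np

def stable_softmax(z):
    # subtracting the row max does not change the result but keeps np.exp from overflowing
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)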
def FeedForward(A_prev, parameters, activation):
L = len(parameters) //2
#parameters has weight and bias inside
caches = [] #this will have all my caches inside
for l in range(1, L):
A, cache, _ = linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], activation)
caches.append(cache)
Al, cache, logits = linear_activation_forward(A, parameters["W" + str(L)], parameters["b"+str(L)], "softmax")
caches.append(cache)
return caches, Al, logits
def CategoricalCrossEntropy(y, yhat):
#I have been struggling quite a lot with cross-entropy and I am still not sure if I am using the right one
#Another version I have found is this one -> loss = -(np.sum(y) * np.sum(np.log(yhat)))
return (yhat-y)
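For comparison, the categorical cross-entropy that normally goes with softmax is -sum(y * log(yhat)), averaged over the batch, which is different from both versions above. A sketch of how I understand it (assuming y is one-hot and yhat contains probabilities, with samples as rows; the clip only guards against log(0)):

import numpy as np

def categorical_cross_entropy(y, yhat, eps=1e-12):
    # mean over the batch of -sum_k y_k * log(yhat_k)
    yhat = np.clip(yhat, eps, 1.0 - eps)  # keep log() away from 0 and exact 1
    return -np.mean(np.sum(y * np.log(yhat), axis=1))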
###Backpropagation###
def linear_backward(dZ, linearCache):
A_prev, W, b = linearCache
m = A_prev.shape[1]
#dW, db and dA_prev are derivatives of weight, bias and activation value from previous layer
dW = (1/m)*np.dot(A_prev.T, dZ)
db = (1/m)*np.sum(dZ, axis = 0, keepdims = True)
dA_prev = np.dot(dZ, W.T)
return dA_prev, dW, db
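Just as a sanity check on the shapes in this backward step, here is a tiny self-contained example with hypothetical sizes (a batch of 4 samples stored as rows, a 5-unit layer feeding a 3-unit layer; with that layout the batch size is A_prev.shape[0]):

import numpy as np

A_prev = np.random.randn(4, 5)               # activations of the previous layer
W = np.random.randn(5, 3)                    # weights of the current layer
dZ = np.random.randn(4, 3)                   # gradient w.r.t. the pre-activations
m = A_prev.shape[0]                          # number of samples in the batch
dW = np.dot(A_prev.T, dZ) / m                # (5, 3) - matches W
db = np.sum(dZ, axis=0, keepdims=True) / m   # (1, 3) - matches b
dA_prev = np.dot(dZ, W.T)                    # (4, 5) - matches A_prev
print(dW.shape, db.shape, dA_prev.shape)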
def linear_activation_backward(dA, currentCache, activation):
Z, A_prev, W, b = currentCache
linearCache = (A_prev, W, b)
if activation == "sigmoid":
dZ = sigmoid_backward(dA, Z)
#.
#.
#.
elif activation == "softmax":
dZ = softmax_backward(dA, Z)
dA_prev, dW, db = linear_backward(dZ, linearCache)
return dA_prev, dW, db
def sigmoid_backward(dA, Z):
A, Z = sigmoid(Z)
dZ = ((1-A)*dA)
return dZ
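For reference, the textbook derivative of the sigmoid is sigma(z) * (1 - sigma(z)), so the chain-rule step is usually written like this (a standalone sketch, not tied to the code above):

import numpy as np

def sigmoid_backward_reference(dA, Z):
    A = 1.0 / (1.0 + np.exp(-Z))
    return dA * A * (1.0 - A)  # dL/dZ = dL/dA * sigmoid'(Z)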
def softmax_backward(dA, Z):
#x, _ = softmax(dA)
#s = x.reshape(-1,1)
#dZ = np.diagflat(s) - np.dot(s, s.T)
J = - dA[..., None] * dA[:, None, :] # off-diagonal Jacobian
iy, ix = np.diag_indices_from(J[0])
J[:, iy, ix] = dA * (1. - dA) # diagonal
return J.sum(axis=1)
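For reference, when softmax is followed by categorical cross-entropy, the full Jacobian above is usually never built: the combined gradient with respect to the logits Z collapses to yhat - y. A minimal sketch of that shortcut (assuming Y is one-hot, yhat comes from softmax, and samples are rows):

import numpy as np

def softmax_cross_entropy_backward(yhat, Y):
    # gradient of the batch-averaged cross-entropy w.r.t. the pre-softmax logits Z
    m = Y.shape[0]
    return (yhat - Y) / m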
def Backpropagation(dAl, caches, activation):
#dAl - derivative of the loss with respect to the activation of the last layer
L = len(caches) #one cache per layer
current_cache = caches[L-1]
grads = {}
grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = self.linear_activation_backward(dAl, current_cache, "softmax")
for l in reversed(range(L-1)):
current_cache = caches[l]
grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = self.linear_activation_backward(grads["dA" + str(l+1)], current_cache, activation)
return grads
def Gradient_Descent(grads, parameters, learning_rate):
L = len(parameters)//2
for l in range(L):
self.parameters["W" + str(l+1)] = self.parameters["W" + str(l+1)] - grads["dW" + str(l+1)]*learning_rate
self.parameters["b" + str(l+1)] = self.parameters["b" + str(l+1)] - grads["db" + str(l+1)]*learning_rate
return parameters
def train(X, Y):
A = X
labels = Y
grads = {}
parameters = self.parameters
learning_rate = 0.01
for epochs in range(self.epochs):
A_prev = A
caches, Al, logits = FeedForward(A_prev, parameters, self.activation)
loss = CategoricalCrossEntropy(labels, softmax(Al))
dA = np.divide(Al - Y, np.divide(Al, 1-Al))
dAl = -(np.divide(Y, Al) - np.divide(1 - Y, 1 - Al))
#not sure which one to choose; both of them make the loss NaN (see the small reproduction sketch after this code)
grads = Backpropagation(dAl, caches, self.activation)
if optimizer == "gradientDescent":
parameters = Gradient_Descent(grads, parameters, learning_rate)
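To show what I mean by the loss turning into NaN: as soon as any entry of Al becomes exactly 0 or 1, both expressions for dAl divide by zero and the NaN then spreads through Backpropagation. A tiny self-contained reproduction:

import numpy as np

Al = np.array([[1.0, 0.0, 0.0]])  # a fully saturated softmax output
Y = np.array([[1.0, 0.0, 0.0]])
with np.errstate(divide='ignore', invalid='ignore'):
    dAl = -(np.divide(Y, Al) - np.divide(1 - Y, 1 - Al))
print(dAl)  # [[nan nan nan]]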
All of this gives a loss that looks like this:
Loss: [[[ 1.00582113e-04 9.72306300e-05 1.01261971e-04 ... -9.99894497e-01
1.11269120e-04 9.89210011e-05]
[ 1.16420796e-04 1.11381422e-04 -9.99906525e-01 ... 9.33883929e-05
1.10171812e-04 1.03851887e-04]
[ 1.01710476e-04 -9.99897105e-01 9.09620320e-05 ... 1.01604678e-04
1.00675631e-04 1.07537724e-04]
which is definitely wrong, and after a few iterations it outputs this:
Loss: [[[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
...
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]]
which is even more wrong.
Can somebody tell me what I am doing wrong? If needed, I can send you the whole file.
Thanks for any help!