Anomalous difference between the results of a Keras model and a model built from scratch with numpy

Time: 2019-07-05 10:37:00

Tags: neural-network deep-learning keras-layer mnist tf.keras

I have implemented a regularized neural network from scratch with numpy to classify the MNIST dataset. When I compare my model's results with those of a tf.Keras Sequential model, they differ substantially.

All hyperparameters are identical in both networks (batch_size, learning_rate, lambda i.e. the regularization rate, and the number of epochs). I initialize every kernel to zero, use full-batch gradient descent, and set shuffle to False to remove any randomness. Both models are built with the same optimizer (SGD), the same loss (MSE), the same regularization (l2), and the same activation (sigmoid in every layer).
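To make "identical hyperparameters" concrete, both runs share a setup along these lines (the numeric values below are illustrative placeholders rather than my exact settings; x_train and y_train stand for the flattened MNIST images and one-hot labels):

# Illustrative shared settings (placeholder values, not my exact ones).
LAMDA = 0.1               # l2 regularization rate, passed to both models
LR = 0.5                  # learning rate for plain SGD in both models
EPOCHS = 10               # same number of epochs in both runs
# x_train: (N, 784) float array of flattened MNIST images
# y_train: (N, 10) one-hot labels
# batch size = N (full-batch gradient descent) and shuffling is disabled,
# so both runs are fully deterministic from the all-zero initialization.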

Interestingly, when I set lambda to 0, the results (training loss, accuracy, weights, gradients, etc.) match exactly. So I suspect the problem lies in how Keras implements its regularizer compared with how I implement mine.
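To illustrate where I think the two setups might diverge, here is a minimal sketch of the two penalty terms as I understand them (assuming, per the Keras docs, that kernel_regularizer=l2(lamda) adds lamda * sum(w**2) per kernel to the loss, while my calcCost adds 0.5 * lamda * ||W||**2 per layer):

import numpy as np

def keras_style_penalty(weights, lamda):
    # tf.keras.regularizers.l2(lamda) contributes lamda * sum(w**2) for each kernel
    return sum(lamda * np.sum(w**2) for w in weights)

def my_penalty(weights, lamda):
    # my calcCost adds 0.5 * lamda * (squared Frobenius norm) per weight matrix
    return 0.5 * lamda * sum(np.linalg.norm(w)**2 for w in weights)

# For the same weights and the same lamda these differ by a factor of 2,
# and so do the corresponding gradient contributions (2*lamda*W vs. lamda*W).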

Keras model:

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2

def kerasModel(lamda, lr):
    model = Sequential()
    model.add(Dense(30, input_dim=784, activation="sigmoid",
                    kernel_regularizer=l2(lamda),
                    kernel_initializer='zeros',  # rn(stddev=1)
                    bias_initializer='zeros'))
    model.add(Dense(10, activation="sigmoid",
                    kernel_regularizer=l2(lamda),
                    kernel_initializer='zeros',  # rn(stddev=1)
                    bias_initializer='zeros'))
    optimizer = tf.keras.optimizers.SGD(lr=lr)
    model.compile(optimizer=optimizer,
                  loss='mse',
                  metrics=['accuracy'])
    return model
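For completeness, the Keras model is trained roughly like this (a sketch using the placeholder settings from above; x_train and y_train are the flattened images and one-hot labels):

# Sketch of the Keras training run (placeholder values from above).
keras_net = kerasModel(lamda=LAMDA, lr=LR)
keras_net.fit(x_train, y_train,
              batch_size=len(x_train),  # full-batch gradient descent
              epochs=EPOCHS,
              shuffle=False,            # no shuffling, to keep the run deterministic
              verbose=0)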

My model:

class MyNeuralNet:
    def __init__(self, layerArray):
        """ 
        layerArray: array of dimensions of layers. 
        size of layerArray is the number of layers in our network 
        """
        self.layers = layerArray
        self.B = []
        self.W = []
        self.input = None
        for layerNum in range(1, len(layerArray)): #1st layer is input so we exclude that
            biasVector = np.zeros((layerArray[layerNum], 1))
            self.B.append(biasVector)
            weightsMatrix = np.zeros((layerArray[layerNum], layerArray[layerNum-1]))
            self.W.append(weightsMatrix)

    def calcCost(self, data, lamda):
        """ Calculates the loss """
        j = 0
        for data_ in data:
            yPred = self.forwardPass(data_[0])
            y = data_[1].reshape((10,1))
            costEx = cost(yPred, y) #cost of one example
            j += sum(costEx)
        j = j/len(data)
        j += 0.5*(lamda)*sum(
            [np.linalg.norm(w)**2 for w in self.W])
        return j

    def netSize(self):
        """ number of layers in the network excluding the input layer"""    
        return len(self.layers) - 1

    def activateLayer(self, z, func = 0):
        ''' applies specified activation function to the layer z.
        0 for Sigmoid
        1 for Relu '''
        if func == 1:
            return relu(z)
        elif not func:
            return sigmoid(z)
        else:
            raise Exception("Activation function can either be 0 (sigmoid) or 1 (relu)")

    def derivatieActivateLayer(self, z, func = 0):
        """ applies the derivative of the specified activation function to the layer z.
        0 for Sigmoid
        1 for Relu"""
        z = np.array(z)
        if func == 1:
            return (z > 0).astype(float)   # relu'(z): 1 where z > 0, else 0
        elif not func:
            sig = self.activateLayer(z, 0)
            return sig*(1-sig)             # sigmoid'(z) = sigmoid(z)*(1 - sigmoid(z))
        else:
            raise Exception("Activation function can either be 0 (sigmoid) or 1 (relu)")

    def forwardPass(self, layer, func = 0):
        """ Outputs the output layer by performing a forward pass """
        layer = layer.reshape((784,1)) 
        for i in range(self.netSize()):
            layer = np.dot(self.W[i], layer) + self.B[i]
            layer = self.activateLayer(layer, func)
        return layer

    def backPropagate(self, x, y, func = 0):
        """
        Backpropagates through the network to compute gradients.
        """
        #initializing gradients
        dW = []
        dB = []
        for i in range(self.netSize()):
            dW.append(np.zeros(self.W[i].shape))
            dB.append(np.zeros(self.B[i].shape))
        outputLayers = [] #Z's 
        activeOutputLayers = [] #Sigmoid of Z's or g(Z)
        x=x[0].reshape((784,1)) 
        activeOutput = x #input layer 
        activeOutputLayers.append(activeOutput)

        for b,w in zip(self.B, self.W):
            output = np.dot(w, activeOutput) + b
            outputLayers.append(output)
            activeOutput = self.activateLayer(output, func)
            activeOutputLayers.append(activeOutput)

        outputLayers = np.array(outputLayers)
        activeOutputLayers = np.array(activeOutputLayers)
        n = self.netSize()
        dZ = derivateCost(activeOutput, y) * self.derivatieActivateLayer(output, func)
        dW[n-1] = np.dot(dZ, activeOutputLayers[-2].T)
        dB[n-1] = dZ
        for l in range(n-1):
            dZ = np.dot(self.W[n-1-l].T, dZ) * self.derivatieActivateLayer(outputLayers[n-2-l], func)
            dB[l] = dZ
            dW[l] = np.dot(dZ , activeOutputLayers[max(0,n-3-l)].T)

        return np.array(dB), np.array(dW)


    def train(self, train, validation, epochs, batchSize, lr, lamda = 0, func = 0):
        """
        Performs gradient descent and updates the network.
        train: training data
        validation: validation data
        epochs: number of iterations
        lamda: regularization rate 
        batchSize: the size of the batch for gradient descent.  
        func: activation function, func = 0 means sigmoid.
        """
        for i in range(epochs):
            for batch in dataIter(batchSize, train):
                xBatch, yBatch = batch[:, :-1], batch[:, -1]
                dW = []
                dB = []
                for j in range(self.netSize()):
                    dW.append(np.zeros(self.W[j].shape))
                    dB.append(np.zeros(self.B[j].shape))
                for x, y in zip(xBatch, yBatch):
                    gradB, gradW = self.backPropagate(x, y, func)
                    n = self.netSize()
                    #summing weights and biases for all examples in the mini batch
                    dW = [w + gradw for w, gradw in zip(dW, gradW)]
                    dB = [b + gradb for b, gradb in zip(dB, gradB)]

                for j in range(self.netSize()):
                    self.W[j] = self.W[j]*(1-(lamda)*lr) - (lr/batchSize)*dW[j] 
                    self.B[j] = self.B[j] - (lr/batchSize)*dB[j]
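My numpy model is trained with matching settings, roughly like this (again a sketch with the placeholder values from above; train_data and val_data are assumed to already be in the layout that dataIter and backPropagate expect, and the helper functions used by the class are defined in the next section):

# Sketch of the matching training run for my numpy model.
my_net = MyNeuralNet([784, 30, 10])         # same 784 -> 30 -> 10 architecture
my_net.train(train_data, val_data,
             epochs=EPOCHS,
             batchSize=len(train_data),     # full batch, as in the Keras run
             lr=LR,
             lamda=LAMDA,
             func=0)                        # sigmoid in every layer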

========== ** Helper Functions ** ============
import random
import numpy as np

def sigmoid(z):
    return 1/(1+np.exp(-z))

def relu(z):
    return np.maximum(0, z)

def cost(yPred, y):
    return 0.1*(yPred-y.reshape((10,1)))**2

def derivateCost(yPred, y):
    return 0.2*(yPred - y.reshape((10,1)))

def dataIter(batchSize, data):
    #random.shuffle(data)
    batches = []
    for i in range(0, data.shape[0], batchSize):
        batches.append(data[i:i+batchSize])
    return batches
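After training, I compare the two models layer by layer along these lines (sketch only; keras_net and my_net are the trained models from the snippets above, and Keras stores each Dense kernel as (input_dim, output_dim), i.e. transposed relative to my W matrices):

# Sketch of the layer-by-layer comparison (keras_net / my_net from above).
for layer_idx, (W, B) in enumerate(zip(my_net.W, my_net.B)):
    kernel, bias = keras_net.layers[layer_idx].get_weights()
    print("layer", layer_idx,
          "max |dW| =", np.abs(kernel.T - W).max(),
          "max |dB| =", np.abs(bias.reshape(-1, 1) - B).max())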

0 Answers:

No answers yet.