Question

尽管标题中的问题很具体（使用Sigmoid时所有输出都收敛到相同的值，而使用其他激活函数时成本通常根本不会降低），但我的网络总体上还是有问题。我花了很多时间进行调试和测试，虽然发现了一些小错误，但仍然无法弄清楚原因。我还在纸上手算了第一个训练周期的反向传播，结果与程序的计算保持一致。

很遗憾，我不知道我的网络究竟哪里出了问题。如果有人可以查看一下，并提示我网络实现的哪些部分有错，我将不胜感激。

我尝试过的事情

不同的激活函数
权重初始化的不同方法（Xavier和随机）

示例（XOR）

20,000次后的结果：

Inputs = 1.000000 0.000000, Target Outputs = 1.000000, Predicted Outputs = 0.028415.
Inputs = 0.000000 1.000000, Target Outputs = 1.000000, Predicted Outputs = 0.028452.
Inputs = 1.000000 1.000000, Target Outputs = 0.000000, Predicted Outputs = 0.028426.
Inputs = 0.000000 0.000000, Target Outputs = 0.000000, Predicted Outputs = 0.028441.

注意

由于我没有提供data.c（它负责从csv文件中加载数据，包含它会使该帖子过长），因此请相信：在model.c的train方法中，将initInput和initTargetOutput应用到input和targetOutput之后，它们分别包含输入的值[1 or 0, 1 or 0]和目标输出的值[1 or 0]。我已经完全验证了这一点。

GitHub

包含以下文件的存储库位于here。我很抱歉没有在编辑前的帖子中包含它。请注意，存储库中有很多代码部分被 #if/#else 宏包围（一个用于梯度检查，另一个用于切换打印）。这些超出了本文的范围（因此我已在以下文件中将其删除）。

model.c

#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include "model.h"
#include "functions.h"

/**
 * Loads an input vector into the input layer of the model.
 *
 * The values are copied into the buffer already stored in
 * <code>model->values[INPUT_LAYER]</code> (the one allocated by initValues). Re-pointing that
 * slot at the caller's array would leak the allocated buffer and leave the model referring to
 * storage the caller owns, which in this file is a stack array that goes out of scope.
 *
 * @param model The model whose input layer is filled. <code>model->values[INPUT_LAYER]</code>
 * must already point to storage for <code>model->neuronsPerLayer[INPUT_LAYER]</code> doubles.
 * @param input The head of an input array of size <code>model->neuronsPerLayer[INPUT_LAYER]</code>
 * that has the inputs of the model. It is only read, and not retained after the call.
 */
void setInput(struct Model* model, double input[]) {
    int inputNeuronCount = model->neuronsPerLayer[INPUT_LAYER];

    for (int inputNeuronIndex = 0; inputNeuronIndex < inputNeuronCount; inputNeuronIndex++)
        model->values[INPUT_LAYER][inputNeuronIndex] = input[inputNeuronIndex];
}

/**
 * Runs a forward pass through the model.
 *
 * The input is handed to setInput, then every following layer is evaluated in order: each
 * neuron's value becomes the activation of (sum of weight * previous-layer value) + bias.
 * The results are written to <code>model->values</code>.
 *
 * @param model The model to evaluate; its weights, biases and values must already be allocated.
 * @param input The input array, of size <code>model->neuronsPerLayer[INPUT_LAYER]</code>.
 */
void propagateInputForward(struct Model* model, double input[]) {
    setInput(model, input);

    for (int layer = 1; layer < NUMBER_OF_LAYERS; layer++) {
        int previousLayer = layer - 1;
        int previousSize = model->neuronsPerLayer[previousLayer];
        int layerSize = model->neuronsPerLayer[layer];

        for (int neuron = 0; neuron < layerSize; neuron++) {
            double sum = 0.0;

            // Accumulate the contribution of every neuron in the previous layer
            for (int source = 0; source < previousSize; source++) {
                double contribution = model->weights[layer][neuron][source]
                                      * model->values[previousLayer][source];
                sum += contribution;
            }

            // The bias is added last, after all weighted inputs
            sum += model->biases[layer][neuron];

            model->values[layer][neuron] = model->getActivation(sum);
        }
    }
}

/**
 * Accumulates the gradients of the cost with respect to every weight and bias for one entry.
 *
 * The neuron values currently stored in {@code model->values} are read, so the entry must have
 * been propagated forward before this call. The contributions of this entry are added to
 * {@code weightGradients} and {@code biasGradients}; the accumulators are not reset here.
 * All temporary error buffers allocated by this function are released before it returns.
 *
 * NOTE(review): the activation derivative is evaluated on the stored (already activated) neuron
 * value, not on the weighted sum. Confirm that {@code getActivationDerivative} expects that.
 *
 * @param model The model which the parameter gradients will be based on.
 * @param targetOutput The expected value of each output neuron, indexed by output neuron.
 * @param weightGradients The weight gradient accumulators to add to, indexed
 * [endLayerIndex][endNeuronIndex][startNeuronIndex].
 * @param biasGradients The bias gradient accumulators to add to, indexed
 * [endLayerIndex][endNeuronIndex].
 */
void updateParameterGradients(struct Model *model, const double* targetOutput, double** weightGradients[],
                              double* biasGradients[]) {
    int outputNeuronCount = model->neuronsPerLayer[OUTPUT_LAYER];

    // Entry indexed by [layerIndex][neuronIndex] gives
    // Δ C / Δ Z[layerIndex, neuronIndex]
    // The input layer's slot is never filled: no weight or bias gradient reads it.
    double* errors[NUMBER_OF_LAYERS];

    errors[OUTPUT_LAYER] = malloc(sizeof(double) * outputNeuronCount);

    // Fill errors of output layers
    for (int outputNeuronIndex = 0; outputNeuronIndex < outputNeuronCount; outputNeuronIndex++) {
        double outputNeuronValue = model->values[OUTPUT_LAYER][outputNeuronIndex];
        double targetOutputNeuronValue = targetOutput[outputNeuronIndex];

        // Δ C_outputNeuronIndex / Δ A[OUTPUT_LAYER][outputNeuronIndex]
        double firstErrorComponent = model->getCostDerivative(outputNeuronValue, targetOutputNeuronValue);
        // Δ A[OUTPUT_LAYER][outputNeuronIndex] / Δ Z[OUTPUT_LAYER][outputNeuronIndex]
        double secondErrorComponent = model->getActivationDerivative(outputNeuronValue);
        // Δ C_outputNeuronIndex / Δ Z[OUTPUT_LAYER][outputNeuronIndex]
        double error = firstErrorComponent * secondErrorComponent;

        errors[OUTPUT_LAYER][outputNeuronIndex] = error;
    }

    // Fill errors of hidden layers, walking backwards from the output layer.
    // The loop stops before the input layer, whose errors would not be used.
    for (int endLayerIndex = OUTPUT_LAYER; endLayerIndex > INPUT_LAYER + 1; endLayerIndex--) {
        int startLayerIndex = endLayerIndex - 1;

        int startNeuronsCount = model->neuronsPerLayer[startLayerIndex];
        int endNeuronsCount = model->neuronsPerLayer[endLayerIndex];

        errors[startLayerIndex] = malloc(sizeof(double) * startNeuronsCount);

        for (int startNeuronIndex = 0; startNeuronIndex < startNeuronsCount; startNeuronIndex++) {
            // The derivative depends only on the start neuron, so evaluate it once per neuron
            double activationValue = model->values[startLayerIndex][startNeuronIndex];
            double activationValueDelta = model->getActivationDerivative(activationValue);

            double error = 0.0;

            for (int endNeuronIndex = 0; endNeuronIndex < endNeuronsCount; endNeuronIndex++) {
                double nextError = errors[endLayerIndex][endNeuronIndex];
                double nextWeight = model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex];

                double errorInfluence = nextWeight * nextError * activationValueDelta;
                error += errorInfluence;
            }

            errors[startLayerIndex][startNeuronIndex] = error;
        }
    }

    // Update weights and biases of all layers based on errors
    for (int endLayerIndex = OUTPUT_LAYER; endLayerIndex > INPUT_LAYER; endLayerIndex--) {
        int startLayerIndex = endLayerIndex - 1;

        int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
        int startNeuronCount = model->neuronsPerLayer[startLayerIndex];

        for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
            double endNeuronError = errors[endLayerIndex][endNeuronIndex];

            // Δ Z / Δ bias is 1, so the bias gradient is the error itself
            biasGradients[endLayerIndex][endNeuronIndex] += endNeuronError;

            for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
                double startNeuronValue = model->values[startLayerIndex][startNeuronIndex];

                double weightGradientInfluence = endNeuronError * startNeuronValue;
                weightGradients[endLayerIndex][endNeuronIndex][startNeuronIndex] += weightGradientInfluence;
            }
        }
    }

    // Release the temporary error buffers; this function runs once per entry per epoch,
    // so leaving them allocated would leak on every call.
    for (int layerIndex = OUTPUT_LAYER; layerIndex > INPUT_LAYER; layerIndex--)
        free(errors[layerIndex]);
}

/**
 * Applies one gradient-descent step to the weights and biases of {@code model}.
 *
 * Each accumulated gradient is divided by {@code batchSize}, multiplied by the model's
 * learning rate, and the result is subtracted from the corresponding parameter.
 *
 * @param model The model whose weights and biases are modified in place.
 * @param weightGradients Accumulated weight gradients, indexed
 * [endLayerIndex][endNeuronIndex][startNeuronIndex].
 * @param biasGradients Accumulated bias gradients, indexed [endLayerIndex][endNeuronIndex].
 * @param batchSize The divisor applied to every accumulated gradient.
 */
void updateParameterValues(struct Model* model, double** weightGradients[], double* biasGradients[], int batchSize) {
    for (int layer = 1; layer < NUMBER_OF_LAYERS; layer++) {
        int layerSize = model->neuronsPerLayer[layer];
        int previousSize = model->neuronsPerLayer[layer - 1];

        for (int neuron = 0; neuron < layerSize; neuron++) {
            // Averaged and scaled bias step
            double biasStep = biasGradients[layer][neuron] / batchSize;
            biasStep = biasStep * model->learningRate;
            model->biases[layer][neuron] -= biasStep;

            double* neuronWeights = model->weights[layer][neuron];
            double* neuronWeightGradients = weightGradients[layer][neuron];

            for (int source = 0; source < previousSize; source++) {
                // Averaged and scaled weight step
                double weightStep = neuronWeightGradients[source] / batchSize;
                weightStep = weightStep * model->learningRate;
                neuronWeights[source] -= weightStep;
            }
        }
    }
}

// File-local counter that train() increments once per call; it is not read anywhere in the code shown here.
static int epochIndex = 0;

/**
 * Allocates the gradient accumulators for every layer after the input layer and sets each
 * entry to 0.
 *
 * Index 0 of both arrays is left untouched. The allocated memory is owned by the caller.
 *
 * @param model The model whose layer sizes determine the allocation sizes.
 * @param weightGradients Filled with arrays indexed [endLayerIndex][endNeuronIndex][startNeuronIndex].
 * @param biasGradients Filled with arrays indexed [endLayerIndex][endNeuronIndex].
 */
void initGradients(struct Model* model, double** weightGradients[], double* biasGradients[]) {
    for (int layer = 1; layer < NUMBER_OF_LAYERS; layer++) {
        int layerSize = model->neuronsPerLayer[layer];
        int previousSize = model->neuronsPerLayer[layer - 1];

        double* layerBiasGradients = malloc(sizeof(double) * layerSize);
        double** layerWeightGradients = malloc(sizeof(double*) * layerSize);

        biasGradients[layer] = layerBiasGradients;
        weightGradients[layer] = layerWeightGradients;

        for (int neuron = 0; neuron < layerSize; neuron++) {
            double* row = malloc(sizeof(double) * previousSize);

            layerBiasGradients[neuron] = 0.0;
            layerWeightGradients[neuron] = row;

            for (int source = 0; source < previousSize; source++) {
                row[source] = 0.0;
            }
        }
    }
}

/**
 * Copies the selected columns of a data entry into the input array.
 *
 * For every position i below {@code inputColumnIndicesCount}, {@code input[i]} receives
 * {@code entry[inputColumnIndices[i]]}.
 *
 * @param input The destination array; must hold at least {@code inputColumnIndicesCount} values.
 * @param entry The data row the values are read from.
 * @param inputColumnIndices The column of {@code entry} to read for each input position.
 * @param inputColumnIndicesCount The number of positions to fill.
 */
void initInput(double input[], const double entry[], const int inputColumnIndices[], int inputColumnIndicesCount) {
    int position = 0;

    while (position < inputColumnIndicesCount) {
        input[position] = entry[inputColumnIndices[position]];
        position++;
    }
}

/**
 * Copies the selected columns of a data entry into the target output array.
 *
 * For every position i below {@code outputColumnIndicesCount}, {@code targetOutput[i]}
 * receives {@code entry[outputColumnIndices[i]]}.
 *
 * @param targetOutput The destination array; must hold at least {@code outputColumnIndicesCount} values.
 * @param entry The data row the values are read from.
 * @param outputColumnIndices The column of {@code entry} to read for each output position.
 * @param outputColumnIndicesCount The number of positions to fill.
 */
void initTargetOutput(double targetOutput[], const double entry[], const int outputColumnIndices[], int outputColumnIndicesCount) {
    int position = 0;

    while (position < outputColumnIndicesCount) {
        targetOutput[position] = entry[outputColumnIndices[position]];
        position++;
    }
}

/**
 * Evaluates the model on every entry of {@code data} without changing any parameters.
 *
 * For each entry the input is propagated forward, the output-layer values are stored in
 * {@code predictedOutputs[entryIndex]}, and the per-output costs are averaged over the number
 * of output neurons and stored in {@code costs[entryIndex]}.
 *
 * @param model The model to evaluate.
 * @param data Container for the entries to evaluate.
 * @param inputColumnIndices The indices of the columns within {@code data} that are the input columns.
 * @param outputColumnIndices The indices of the columns within {@code data} that are the output columns.
 * @param predictedOutputs Filled with the output-layer values, indexed [entryIndex][outputIndex].
 * @param costs Filled with the average cost of each entry, indexed by entry.
 */
void test(struct Model* model, struct Data* data, int inputColumnIndices[], int outputColumnIndices[], double** predictedOutputs, double costs[]) {
    int inputSize = model->neuronsPerLayer[INPUT_LAYER];
    int outputSize = model->neuronsPerLayer[OUTPUT_LAYER];

    for (int row = 0; row < data->numberOfEntries; row++) {
        double input[inputSize];
        double targetOutput[outputSize];

        // Split the row into its input and target columns
        initInput(input, data->elements[row], inputColumnIndices, inputSize);
        initTargetOutput(targetOutput, data->elements[row], outputColumnIndices, outputSize);

        propagateInputForward(model, input);

        double totalCost = 0.0;

        for (int output = 0; output < outputSize; output++) {
            double predicted = model->values[OUTPUT_LAYER][output];

            predictedOutputs[row][output] = predicted;
            totalCost += model->getCost(predicted, targetOutput[output]);
        }

        // Store the average cost over the output neurons
        costs[row] = totalCost / outputSize;
    }
}

/**
 * Releases everything allocated by initGradients.
 *
 * For every layer after the input layer this frees the bias gradient array, each per-neuron
 * weight gradient row, and the array of row pointers itself. Index 0 of both arrays is never
 * allocated and is left untouched. The pointers must not be used after this call.
 *
 * @param model The model whose layer sizes were used to allocate the gradients.
 * @param weightGradients The weight gradients, indexed [endLayerIndex][endNeuronIndex][startNeuronIndex].
 * @param biasGradients The bias gradients, indexed [endLayerIndex][endNeuronIndex].
 */
void freeGradients(struct Model* model, double** weightGradients[], double** biasGradients) {
    for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
        free(biasGradients[endLayerIndex]);

        int endNeuronCount = model->neuronsPerLayer[endLayerIndex];

        for (int neuronIndex = 0; neuronIndex < endNeuronCount; neuronIndex++)
            free(weightGradients[endLayerIndex][neuronIndex]);

        // The rows are gone; now release the array of row pointers as well,
        // otherwise it leaks on every call to train.
        free(weightGradients[endLayerIndex]);
    }
}

/**
 * Runs one training pass over the given data, treating all entries as a single batch.
 *
 * Gradient accumulators are allocated and zeroed, every entry is propagated forward and its
 * gradients accumulated, the parameters are then updated once using the number of entries as
 * the batch size, and finally the accumulators are freed.
 *
 * @param model The model to train; its parameters are modified in place.
 * @param data Container for the data the model will be trained on.
 * @param inputColumnIndices The indices of the columns within {@code data} that are the input columns.
 * @param outputColumnIndices The indices of the columns within {@code data} that are the output columns.
 */
void train(struct Model* model, struct Data* data, int inputColumnIndices[], int outputColumnIndices[]) {
    int inputSize = model->neuronsPerLayer[INPUT_LAYER];
    int outputSize = model->neuronsPerLayer[OUTPUT_LAYER];

    // Slot 0 of both arrays stays unused: the input layer has no incoming parameters.
    // weightGradients: [endLayerIndex][endNeuronIndex][startNeuronIndex]
    // biasGradients:   [endLayerIndex][endNeuronIndex]
    double** weightGradients[NUMBER_OF_LAYERS];
    double* biasGradients[NUMBER_OF_LAYERS];

    // Allocate the accumulators and start them all at 0
    initGradients(model, weightGradients, biasGradients);

    epochIndex++;

    for (int row = 0; row < data->numberOfEntries; row++) {
        double input[inputSize];
        double targetOutput[outputSize];

        // Split the row into its input and target columns
        initInput(input, data->elements[row], inputColumnIndices, inputSize);
        initTargetOutput(targetOutput, data->elements[row], outputColumnIndices, outputSize);

        // Forward pass, then add this entry's contribution to the batch gradients
        propagateInputForward(model, input);
        updateParameterGradients(model, targetOutput, weightGradients, biasGradients);
    }

    // One parameter update for the whole batch
    updateParameterValues(model, weightGradients, biasGradients, data->numberOfEntries);

    freeGradients(model, weightGradients, biasGradients);
}

/**
 * Allocates the memory for the parameters (weights and biases) of the model, in addition to initializing
 * them to their initial values.
 *
 * For every layer after the input layer, one bias array and one array of weight rows are
 * allocated. Each neuron gets one bias from {@code model->getInitialBiasValue} and one weight per
 * neuron of the previous layer from {@code model->getInitialWeightValue}. Index 0 of
 * {@code model->weights} and {@code model->biases} is left untouched.
 *
 * @param model The model to initialize; {@code neuronsPerLayer} and the two initial-value
 * callbacks must already be set.
 */
void initParameters(struct Model* model) {
    for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
        int endNeuronCount = model->neuronsPerLayer[endLayerIndex];

        int startLayerIndex = endLayerIndex - 1;
        int startNeuronCount = model->neuronsPerLayer[startLayerIndex];

        model->weights[endLayerIndex] = malloc(sizeof(double*) * endNeuronCount);
        // The bias array is allocated once per layer. Allocating it inside the neuron loop
        // would replace it for every neuron, leaking the previous arrays and leaving all
        // but the last neuron's bias uninitialized.
        model->biases[endLayerIndex] = malloc(sizeof(double) * endNeuronCount);

        for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
            model->weights[endLayerIndex][endNeuronIndex] = malloc(sizeof(double) * startNeuronCount);

            // One bias per neuron, set exactly once
            model->biases[endLayerIndex][endNeuronIndex] = model->getInitialBiasValue(startNeuronCount, endNeuronCount);

            for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
                model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex] = model->getInitialWeightValue(startNeuronCount, endNeuronCount);
            }
        }
    }
}

/**
 * Allocates the memory for the neuron values of the model.
 *
 * One array of doubles is allocated per layer, including the input layer, sized by
 * {@code model->neuronsPerLayer}. The contents are left uninitialized.
 *
 * @param model The model whose {@code values} slots are filled.
 */
void initValues(struct Model* model) {
    int layer = 0;

    while (layer < NUMBER_OF_LAYERS) {
        model->values[layer] = malloc(sizeof(double) * model->neuronsPerLayer[layer]);
        layer++;
    }
}

main.c

#include <stdio.h>
#include <stdlib.h>
#include <zconf.h>
#include <time.h>
#include "model.h"
#include "functions.h"
#include "data.h"

// Number of times main() calls train() on the training data.
#define EPOCH_COUNT 20000
// Passed as the third argument of fill(); presumably the number of columns per data row
// (two inputs plus one output for XOR) — confirm against data.c.
#define NUMBER_OF_COLUMNS 3
// Passed as the fourth argument of fill() for the training file; presumably its row count — confirm against data.c.
#define TRAIN_ENTRIES_SIZE 4
// Passed as the fourth argument of fill() for the test file; presumably its row count — confirm against data.c.
#define TEST_ENTRIES_SIZE 4

int main() {
    time_t currentTime;
    time(&currentTime);
    srand(currentTime);

    struct Model model = {
            .neuronsPerLayer = {2, 2, 1},
            .learningRate = 0.02,

            // Default values
            .getActivation = applySigmoid,
            .getActivationDerivative = applySigmoidDerivative,
            .getCost = getCost,
            .getCostDerivative = getCostDerivative,
            .getInitialWeightValue = getInitialRandomWeight,
            .getInitialBiasValue = getInitialBias,
    };

    int numberOfInputs = model.neuronsPerLayer[INPUT_LAYER];
    int numberOfOutputs = model.neuronsPerLayer[OUTPUT_LAYER];

    // Change working directory so data can be referenced relative to parent data folder
    chdir("..");

    struct Data trainData;
    fill(&trainData, "data/xor/train.csv", NUMBER_OF_COLUMNS, TRAIN_ENTRIES_SIZE);

    struct Data testData;
    fill(&testData, "data/xor/test.csv", NUMBER_OF_COLUMNS, TEST_ENTRIES_SIZE);

    int inputColumnIndices[numberOfInputs];
    int outputColumnIndices[numberOfOutputs];

    inputColumnIndices[0] = 0;
    inputColumnIndices[1] = 1;
    outputColumnIndices[0] = 2;

    initValues(&model);
    initParameters(&model);

    for (int epochIndex = 0; epochIndex < EPOCH_COUNT; epochIndex++)
        train(&model, &trainData, inputColumnIndices, outputColumnIndices);

    exit(0);
}

如果需要其他文件，请提出要求。

用C语言进行XOR的神经网络，所有输出收敛到相同的值

0 个答案: