尽管标题中的问题比较具体(仅在使用 Sigmoid 时,所有输出都收敛到相同的值;而使用其他激活函数时,代价通常不会下降),但我的网络总体上仍然存在问题。我花了很多时间进行调试和测试,虽然发现了一些小错误,但仍然无法弄清楚原因。我在纸上手算了第一个训练轮次(epoch)的反向传播,结果与代码中相应函数的计算一致。
很遗憾,我不得不承认我不知道网络的问题出在哪里。如果有人可以查看一下,并指出我的网络实现中哪些部分有错,我将不胜感激。
我尝试过的事情
示例(XOR)
训练 20,000 个轮次(epoch)后的结果:
Inputs = 1.000000 0.000000, Target Outputs = 1.000000, Predicted Outputs = 0.028415.
Inputs = 0.000000 1.000000, Target Outputs = 1.000000, Predicted Outputs = 0.028452.
Inputs = 1.000000 1.000000, Target Outputs = 0.000000, Predicted Outputs = 0.028426.
Inputs = 0.000000 0.000000, Target Outputs = 0.000000, Predicted Outputs = 0.028441.
注意
由于我没有提供 data.c(它负责从 csv 文件中加载数据,贴出来会让这篇帖子过长),因此请相信:在 model.c 的 train 方法中,对 input 和 targetOutput 分别调用 initInput 和 initTargetOutput 之后,input 包含输入值 [1 or 0, 1 or 0],targetOutput 包含目标输出值 [1 or 0]。我已经完全验证了这一点。
GitHub
包含以下文件的存储库位于此处(here)。我很抱歉没有在编辑前的帖子中包含它。请注意,存储库中有很多代码段被 #if 宏包围(一个用于梯度检查,另一个用于切换打印)。这些超出了本文的范围(因此我已在以下文件中将其删除)。
model.c
#else
main.c
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include "model.h"
#include "functions.h"
/**
 * Copies the given input values into the model's input layer.
 *
 * The values are copied into the buffer that model->values[INPUT_LAYER] already points to
 * instead of re-pointing that slot at the caller's array. Re-pointing would leak the buffer
 * allocated by initValues() and would leave the model referring to the caller's storage after
 * that storage goes out of scope (train() and test() pass arrays that live on their stack).
 *
 * @param model The model whose input layer is filled. model->values[INPUT_LAYER] must already point
 *              to at least model->neuronsPerLayer[INPUT_LAYER] doubles; initValues() allocates it.
 * @param input The head of an input array of size <code>model.neuronsPerLayer[INPUT_LAYER]</code>
 *              that has the inputs of the model. It is only read.
 */
void setInput(struct Model* model, double input[]) {
    int inputNeuronCount = model->neuronsPerLayer[INPUT_LAYER];
    for (int inputNeuronIndex = 0; inputNeuronIndex < inputNeuronCount; inputNeuronIndex++) {
        model->values[INPUT_LAYER][inputNeuronIndex] = input[inputNeuronIndex];
    }
}
/**
 * Runs one forward pass through the model.
 *
 * The input is handed to setInput(), then each layer after the first is computed in order:
 * every neuron's value is the activation of (sum of weight * previous-layer value) + bias.
 * Results are written to model->values.
 *
 * @param model The model to evaluate; its weights, biases and values must be allocated.
 * @param input The input values, forwarded unchanged to setInput().
 */
void propagateInputForward(struct Model* model, double input[]) {
    setInput(model, input);
    for (int layer = 1; layer < NUMBER_OF_LAYERS; layer++) {
        const int previousLayer = layer - 1;
        const int neuronCount = model->neuronsPerLayer[layer];
        const int previousNeuronCount = model->neuronsPerLayer[previousLayer];
        for (int neuron = 0; neuron < neuronCount; neuron++) {
            double preActivation = 0.0;
            for (int source = 0; source < previousNeuronCount; source++) {
                // Product kept in its own statement so the accumulation order matches the original.
                double contribution = model->weights[layer][neuron][source]
                                      * model->values[previousLayer][source];
                preActivation += contribution;
            }
            // The bias is added last, after all weighted inputs.
            preActivation += model->biases[layer][neuron];
            model->values[layer][neuron] = model->getActivation(preActivation);
        }
    }
}
/** Allocates storage for one layer's error terms; terminates the program if the allocation fails. */
static double* allocateLayerErrors(int neuronCount) {
    double* layerErrors = malloc(sizeof *layerErrors * (size_t) neuronCount);
    // malloc(0) may legitimately return NULL, so only a non-empty request counts as a failure.
    if (layerErrors == NULL && neuronCount > 0) {
        fprintf(stderr, "updateParameterGradients: out of memory\n");
        exit(EXIT_FAILURE);
    }
    return layerErrors;
}

/**
 * Adds the gradient contribution of one training entry to the given accumulators.
 *
 * Reads the neuron values left in model->values by the preceding forward pass, so
 * propagateInputForward() must have been called for this entry first. All temporary
 * error storage is released before returning.
 *
 * @param model The model which the parameter gradients will be based on.
 * @param targetOutput The target values of the output layer, one per output neuron.
 * @param weightGradients The weight gradient accumulator, indexed
 *                        [endLayerIndex][endNeuronIndex][startNeuronIndex]; values are added to it.
 * @param biasGradients The bias gradient accumulator, indexed [endLayerIndex][endNeuronIndex];
 *                      values are added to it.
 */
void updateParameterGradients(struct Model *model, const double* targetOutput, double** weightGradients[],
                              double* biasGradients[]) {
    int outputNeuronCount = model->neuronsPerLayer[OUTPUT_LAYER];
    // Entry indexed by [layerIndex][neuronIndex] gives
    // Δ C / Δ Z[layerIndex, neuronIndex]
    double* errors[NUMBER_OF_LAYERS];
    // Start from NULL so the cleanup loop at the end can free every slot unconditionally.
    for (int layerIndex = 0; layerIndex < NUMBER_OF_LAYERS; layerIndex++) {
        errors[layerIndex] = NULL;
    }

    errors[OUTPUT_LAYER] = allocateLayerErrors(outputNeuronCount);
    // Fill errors of output layers
    for (int outputNeuronIndex = 0; outputNeuronIndex < outputNeuronCount; outputNeuronIndex++) {
        double outputNeuronValue = model->values[OUTPUT_LAYER][outputNeuronIndex];
        double targetOutputNeuronValue = targetOutput[outputNeuronIndex];
        // Δ C_outputNeuronIndex / Δ A[OUTPUT_LAYER][outputNeuronIndex]
        double firstErrorComponent = model->getCostDerivative(outputNeuronValue, targetOutputNeuronValue);
        // Δ A[OUTPUT_LAYER][outputNeuronIndex] / Δ Z[OUTPUT_LAYER][outputNeuronIndex]
        // NOTE(review): the derivative callback receives the activated value A, not the weighted
        // sum Z. That is only correct if the callback is written in terms of A (for sigmoid,
        // A * (1 - A)) — confirm against the implementation in functions.c.
        double secondErrorComponent = model->getActivationDerivative(outputNeuronValue);
        // Δ C_outputNeuronIndex / Δ Z[OUTPUT_LAYER][outputNeuronIndex]
        double error = firstErrorComponent * secondErrorComponent;
        errors[OUTPUT_LAYER][outputNeuronIndex] = error;
    }

    // Fill errors of the layers between input and output. The input layer is skipped:
    // its error terms are never read by the accumulation loop below.
    for (int startLayerIndex = OUTPUT_LAYER - 1; startLayerIndex > INPUT_LAYER; startLayerIndex--) {
        int endLayerIndex = startLayerIndex + 1;
        int startNeuronsCount = model->neuronsPerLayer[startLayerIndex];
        int endNeuronsCount = model->neuronsPerLayer[endLayerIndex];
        errors[startLayerIndex] = allocateLayerErrors(startNeuronsCount);
        for (int startNeuronIndex = 0; startNeuronIndex < startNeuronsCount; startNeuronIndex++) {
            // The activation derivative depends only on the start neuron, so compute it once.
            double activationValue = model->values[startLayerIndex][startNeuronIndex];
            double activationValueDelta = model->getActivationDerivative(activationValue);
            double error = 0.0;
            for (int endNeuronIndex = 0; endNeuronIndex < endNeuronsCount; endNeuronIndex++) {
                double nextError = errors[endLayerIndex][endNeuronIndex];
                double nextWeight = model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex];
                double errorInfluence = nextWeight * nextError * activationValueDelta;
                error += errorInfluence;
            }
            errors[startLayerIndex][startNeuronIndex] = error;
        }
    }

    // Accumulate weight and bias gradients of all layers based on errors
    for (int endLayerIndex = OUTPUT_LAYER; endLayerIndex > INPUT_LAYER; endLayerIndex--) {
        int startLayerIndex = endLayerIndex - 1;
        int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
        int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
        for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
            double endNeuronError = errors[endLayerIndex][endNeuronIndex];
            // The bias gradient is the error itself.
            biasGradients[endLayerIndex][endNeuronIndex] += endNeuronError;
            for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
                double startNeuronValue = model->values[startLayerIndex][startNeuronIndex];
                double weightGradientInfluence = endNeuronError * startNeuronValue;
                weightGradients[endLayerIndex][endNeuronIndex][startNeuronIndex] += weightGradientInfluence;
            }
        }
    }

    // Release the temporary error storage; this function runs once per entry per epoch,
    // so keeping it would leak on every call.
    for (int layerIndex = 0; layerIndex < NUMBER_OF_LAYERS; layerIndex++) {
        free(errors[layerIndex]);
    }
}
/**
 * Updates the weight and bias values within {@code model}, given the gradients of the cost function
 * with respect to the weights and biases.
 *
 * Each accumulated gradient is divided by {@code batchSize}, scaled by model->learningRate and
 * subtracted from the corresponding parameter.
 *
 * @param model The model whose weights and biases are adjusted in place.
 * @param weightGradients Accumulated weight gradients, indexed
 *                        [endLayerIndex][endNeuronIndex][startNeuronIndex].
 * @param biasGradients Accumulated bias gradients, indexed [endLayerIndex][endNeuronIndex].
 * @param batchSize Number of entries that were accumulated into the gradients. When it is not
 *                  positive the model is left unchanged.
 */
void updateParameterValues(struct Model* model, double** weightGradients[], double* biasGradients[], int batchSize) {
    // An empty batch carries no gradient information, and dividing the zeroed
    // accumulators by zero would turn every parameter into NaN.
    if (batchSize <= 0) {
        return;
    }
    for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
        int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
        int startLayerIndex = endLayerIndex - 1;
        int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
        for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
            double biasDelta = biasGradients[endLayerIndex][endNeuronIndex];
            biasDelta /= batchSize;
            biasDelta *= model->learningRate;
            // update bias
            model->biases[endLayerIndex][endNeuronIndex] -= biasDelta;
            for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
                double weightDelta = weightGradients[endLayerIndex][endNeuronIndex][startNeuronIndex];
                weightDelta /= batchSize;
                weightDelta *= model->learningRate;
                // update weight
                model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex] -= weightDelta;
            }
        }
    }
}
// Counts how many times train() has been entered; train() increments it once per call.
// Nothing in this listing reads it.
static int epochIndex = 0;
/**
 * Allocates the gradient accumulators for every layer after the first and sets them to zero.
 *
 * Slot 0 of both arrays is left untouched. The caller owns the allocated memory.
 *
 * @param model The model whose neuronsPerLayer determines the sizes.
 * @param weightGradients Filled so that [layer][neuron] points to one double per neuron of the previous layer.
 * @param biasGradients Filled so that [layer] points to one double per neuron of that layer.
 */
void initGradients(struct Model* model, double** weightGradients[], double* biasGradients[]) {
    for (int layer = 1; layer < NUMBER_OF_LAYERS; layer++) {
        const int neuronCount = model->neuronsPerLayer[layer];
        const int previousNeuronCount = model->neuronsPerLayer[layer - 1];

        double* layerBiasGradients = malloc(sizeof(double) * neuronCount);
        double** layerWeightGradients = malloc(sizeof(double*) * neuronCount);
        biasGradients[layer] = layerBiasGradients;
        weightGradients[layer] = layerWeightGradients;

        for (int neuron = 0; neuron < neuronCount; neuron++) {
            layerBiasGradients[neuron] = 0.0;
            double* row = malloc(sizeof(double) * previousNeuronCount);
            layerWeightGradients[neuron] = row;
            for (int source = 0; source < previousNeuronCount; source++) {
                row[source] = 0.0;
            }
        }
    }
}
/**
 * Gathers the input columns of one data entry into a dense input array.
 *
 * For each position i, input[i] receives entry[inputColumnIndices[i]].
 *
 * @param input Destination array with room for inputColumnIndicesCount values.
 * @param entry One row of the data set; only read.
 * @param inputColumnIndices Column positions within entry to copy, in destination order.
 * @param inputColumnIndicesCount Number of columns to copy.
 */
void initInput(double input[], const double entry[], const int inputColumnIndices[], int inputColumnIndicesCount) {
    int slot = 0;
    while (slot < inputColumnIndicesCount) {
        input[slot] = entry[inputColumnIndices[slot]];
        slot++;
    }
}
/**
 * Gathers the output columns of one data entry into a dense target-output array.
 *
 * For each position i, targetOutput[i] receives entry[outputColumnIndices[i]].
 *
 * @param targetOutput Destination array with room for outputColumnIndicesCount values.
 * @param entry One row of the data set; only read.
 * @param outputColumnIndices Column positions within entry to copy, in destination order.
 * @param outputColumnIndicesCount Number of columns to copy.
 */
void initTargetOutput(double targetOutput[], const double entry[], const int outputColumnIndices[], int outputColumnIndicesCount) {
    for (int position = 0; position != outputColumnIndicesCount && position < outputColumnIndicesCount; ++position) {
        const int sourceColumn = outputColumnIndices[position];
        const double columnValue = entry[sourceColumn];
        targetOutput[position] = columnValue;
    }
}
/**
 * Evaluates the model on every entry of the data set.
 *
 * For each entry the input and target columns are extracted, a forward pass is run, the
 * output-layer values are stored in predictedOutputs[entryIndex], and the mean of
 * model->getCost over the output neurons is stored in costs[entryIndex].
 *
 * @param model The model to evaluate.
 * @param data The data set; data->numberOfEntries rows are read from data->elements.
 * @param inputColumnIndices Columns of each entry that hold the inputs.
 * @param outputColumnIndices Columns of each entry that hold the target outputs.
 * @param predictedOutputs Receives one row of output-layer values per entry.
 * @param costs Receives one averaged cost per entry.
 */
void test(struct Model* model, struct Data* data, int inputColumnIndices[], int outputColumnIndices[], double** predictedOutputs, double costs[]) {
    const int inputWidth = model->neuronsPerLayer[INPUT_LAYER];
    const int outputWidth = model->neuronsPerLayer[OUTPUT_LAYER];

    for (int row = 0; row < data->numberOfEntries; row++) {
        double *entry = data->elements[row];
        double input[inputWidth];
        double targetOutput[outputWidth];

        initInput(input, entry, inputColumnIndices, inputWidth);
        initTargetOutput(targetOutput, entry, outputColumnIndices, outputWidth);

        propagateInputForward(model, input);

        // Record each prediction while summing the per-neuron cost.
        double totalCost = 0.0;
        for (int out = 0; out < outputWidth; out++) {
            double predicted = model->values[OUTPUT_LAYER][out];
            predictedOutputs[row][out] = predicted;
            totalCost += model->getCost(predicted, targetOutput[out]);
        }

        // Store the mean cost across the output neurons.
        costs[row] = totalCost / outputWidth;
    }
}
/**
 * Releases everything initGradients() allocated.
 *
 * For each layer after the first this frees the bias gradient array, every per-neuron weight
 * gradient row, and the array of row pointers itself. Freed slots are set to NULL, so calling
 * this again on the same arrays is harmless.
 *
 * @param model The model whose neuronsPerLayer gives the number of rows per layer.
 * @param weightGradients The weight gradient storage to release.
 * @param biasGradients The bias gradient storage to release.
 */
void freeGradients(struct Model* model, double** weightGradients[], double** biasGradients) {
    for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
        free(biasGradients[endLayerIndex]);
        biasGradients[endLayerIndex] = NULL;

        if (weightGradients[endLayerIndex] == NULL) {
            continue;
        }
        int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
        for (int neuronIndex = 0; neuronIndex < endNeuronCount; neuronIndex++) {
            free(weightGradients[endLayerIndex][neuronIndex]);
        }
        // The row-pointer array is a separate allocation and must be freed after its rows.
        free(weightGradients[endLayerIndex]);
        weightGradients[endLayerIndex] = NULL;
    }
}
/**
 * Runs one training pass over the whole data set, treating it as a single batch.
 *
 * Gradients are accumulated over every entry (a forward pass followed by
 * updateParameterGradients), then applied once with updateParameterValues using the
 * number of entries as the batch size. Gradient storage is allocated and released
 * inside this call.
 *
 * @param model The model to train; its parameters are modified in place.
 * @param data Container for the data the model will be trained on.
 * @param inputColumnIndices The indices of the columns within {@code data} that are the input columns.
 * @param outputColumnIndices The indices of the columns within {@code data} that are the output columns.
 */
void train(struct Model* model, struct Data* data, int inputColumnIndices[], int outputColumnIndices[]) {
    // Slot 0 of both arrays stays unused; initGradients only fills slots 1 and up.
    // weightGradients: [endLayerIndex][endNeuronIndex][startNeuronIndex]
    // biasGradients:   [endLayerIndex][endNeuronIndex]
    double** weightGradients[NUMBER_OF_LAYERS];
    double* biasGradients[NUMBER_OF_LAYERS];
    initGradients(model, weightGradients, biasGradients);

    const int inputWidth = model->neuronsPerLayer[INPUT_LAYER];
    const int outputWidth = model->neuronsPerLayer[OUTPUT_LAYER];
    const int entryCount = data->numberOfEntries;

    epochIndex++;

    for (int row = 0; row < entryCount; row++) {
        double* entry = data->elements[row];
        double input[inputWidth];
        double targetOutput[outputWidth];

        // Split the entry into its input and target columns.
        initInput(input, entry, inputColumnIndices, inputWidth);
        initTargetOutput(targetOutput, entry, outputColumnIndices, outputWidth);

        // Forward pass, then add this entry's share to the batch gradients.
        propagateInputForward(model, input);
        updateParameterGradients(model, targetOutput, weightGradients, biasGradients);
    }

    updateParameterValues(model, weightGradients, biasGradients, entryCount);
    freeGradients(model, weightGradients, biasGradients);
}
/**
 * Allocates the memory for the parameters (weights and biases) of the model, in addition to initializing
 * them to their default values.
 *
 * For every layer after the first, one bias array is allocated for the whole layer and one
 * weight row per neuron. Every bias is set from model->getInitialBiasValue and every weight
 * from model->getInitialWeightValue, both called with (startNeuronCount, endNeuronCount).
 *
 * @param model The model whose weights and biases slots are filled; neuronsPerLayer and the two
 *              initial-value callbacks must already be set.
 */
void initParameters(struct Model* model) {
    for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
        int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
        int startLayerIndex = endLayerIndex - 1;
        int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
        model->weights[endLayerIndex] = malloc(sizeof(double*) * endNeuronCount);
        // Allocate the bias array once per layer. Allocating it inside the neuron loop would
        // replace it for every neuron, leaking the previous array and leaving all biases
        // except the last one uninitialized.
        model->biases[endLayerIndex] = malloc(sizeof(double) * endNeuronCount);
        for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
            model->weights[endLayerIndex][endNeuronIndex] = malloc(sizeof(double) * startNeuronCount);
            // One bias per neuron, set independently of how many incoming weights it has.
            model->biases[endLayerIndex][endNeuronIndex] = model->getInitialBiasValue(startNeuronCount, endNeuronCount);
            for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
                model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex] = model->getInitialWeightValue(startNeuronCount, endNeuronCount);
            }
        }
    }
}
/**
 * Allocates the memory for the neuron values of the model.
 *
 * Every layer, including the first, gets an array of one double per neuron.
 * The contents are left uninitialized.
 *
 * @param model The model whose values slots are filled; neuronsPerLayer must already be set.
 */
void initValues(struct Model* model) {
    int layer = 0;
    while (layer < NUMBER_OF_LAYERS) {
        model->values[layer] = malloc(sizeof(double) * model->neuronsPerLayer[layer]);
        layer++;
    }
}
main.c
#include <stdio.h>
#include <stdlib.h>
#include <zconf.h>
#include <time.h>
#include "model.h"
#include "functions.h"
#include "data.h"
// Number of times main() calls train() on the training data.
#define EPOCH_COUNT 20000
// Passed to fill() for both CSV files; presumably the number of columns per entry
// (main() uses columns 0 and 1 as inputs and column 2 as the output) — confirm against data.c.
#define NUMBER_OF_COLUMNS 3
// Passed to fill() for data/xor/train.csv; presumably the number of entries to load — confirm against data.c.
#define TRAIN_ENTRIES_SIZE 4
// Passed to fill() for data/xor/test.csv; presumably the number of entries to load — confirm against data.c.
#define TEST_ENTRIES_SIZE 4
/**
 * Builds a 2-2-1 sigmoid network and trains it on the XOR data for EPOCH_COUNT epochs.
 *
 * The random generator is seeded from the current time. The working directory is moved one
 * level up so the data files can be referenced relative to the parent folder.
 *
 * @return EXIT_SUCCESS after training, EXIT_FAILURE if the working directory cannot be changed.
 */
int main(void) {
    time_t currentTime;
    time(&currentTime);
    // srand takes an unsigned int; make the narrowing from time_t explicit.
    srand((unsigned int) currentTime);
    struct Model model = {
            .neuronsPerLayer = {2, 2, 1},
            .learningRate = 0.02,
            // Default values
            .getActivation = applySigmoid,
            .getActivationDerivative = applySigmoidDerivative,
            .getCost = getCost,
            .getCostDerivative = getCostDerivative,
            .getInitialWeightValue = getInitialRandomWeight,
            .getInitialBiasValue = getInitialBias,
    };
    int numberOfInputs = model.neuronsPerLayer[INPUT_LAYER];
    int numberOfOutputs = model.neuronsPerLayer[OUTPUT_LAYER];
    // Change working directory so data can be referenced relative to parent data folder
    if (chdir("..") != 0) {
        // Without this the relative data paths below would point at the wrong place.
        perror("chdir");
        return EXIT_FAILURE;
    }
    struct Data trainData;
    fill(&trainData, "data/xor/train.csv", NUMBER_OF_COLUMNS, TRAIN_ENTRIES_SIZE);
    struct Data testData;
    fill(&testData, "data/xor/test.csv", NUMBER_OF_COLUMNS, TEST_ENTRIES_SIZE);
    int inputColumnIndices[numberOfInputs];
    int outputColumnIndices[numberOfOutputs];
    // Columns 0 and 1 of each entry are the inputs, column 2 is the target output.
    inputColumnIndices[0] = 0;
    inputColumnIndices[1] = 1;
    outputColumnIndices[0] = 2;
    initValues(&model);
    initParameters(&model);
    for (int epochIndex = 0; epochIndex < EPOCH_COUNT; epochIndex++) {
        train(&model, &trainData, inputColumnIndices, outputColumnIndices);
    }
    return EXIT_SUCCESS;
}
如果需要其他文件,请提出要求。