I'm trying to solve the XOR problem, but the output always converges to 0.5, so I tried a simpler problem such as NOT and the same thing happened.
I really don't know what's going on. I've checked the code a million times and everything seems correct. When I debugged it by saving the network's state, I noticed that the weight and bias values keep growing really large. To write this I followed the 3Blue1Brown YouTube series on neural networks, along with a few other videos.
Here is my code:
PS: I'm putting the whole code here, but I think the main problem is inside the backPropag function.
class NeuralNetwork {
int inNum, hiddenLayersNum, outNum, netSize;
int[] hiddenLayerSize;
Matrix[] weights;
Matrix[] biases;
Matrix[] sums;
Matrix[] activations;
Matrix[] error;
Matrix inputs;
long samples = 0;
float learningRate;
//Constructor------------------------------------------------------------------------------------------------------
NeuralNetwork(int inNum, int hiddenLayersNum, int[] hiddenLayerSize, int outNum, float learningRate) {
this.inNum = inNum;
this.hiddenLayersNum = hiddenLayersNum;
this.hiddenLayerSize = hiddenLayerSize;
this.outNum = outNum;
this.netSize = hiddenLayersNum + 1;
this.learningRate = learningRate;
//the hidden layers plus the output layer
//Note: I'm not adding the input layer because it doesn't have weights
weights = new Matrix[netSize];
//no biases added to the output layer
biases = new Matrix[netSize - 1];
sums = new Matrix[netSize];
activations = new Matrix[netSize];
error = new Matrix[netSize];
initializeHiddenLayer();
initializeOutputLayer();
}
//Initializing Algorithms------------------------------------------------------------------------------------------
void initializeHiddenLayer() {
for (int i = 0; i < hiddenLayersNum; i++) {
if (i == 0) {//only the first hidden layer takes the inputs
weights[i] = new Matrix(hiddenLayerSize[i], inNum);
} else {
weights[i] = new Matrix(hiddenLayerSize[i], hiddenLayerSize[i - 1]);
}
biases[i] = new Matrix(hiddenLayerSize[i], 1);
sums[i] = new Matrix(hiddenLayerSize[i], 1);
activations[i] = new Matrix(hiddenLayerSize[i], 1);
error[i] = new Matrix(hiddenLayerSize[i], 1);
}
}
void initializeOutputLayer() {
//the output layer takes the last hidden layer activation values
weights[netSize - 1] = new Matrix(outNum, hiddenLayerSize[hiddenLayerSize.length - 1]);
activations[netSize - 1] = new Matrix(outNum, 1);
sums[netSize - 1] = new Matrix(outNum, 1);
error[netSize - 1] = new Matrix(outNum, 1);
for (Matrix m : weights) {
for (int i = 0; i < m.i; i++) {
for (int j = 0; j < m.j; j++) {
m.values[i][j] = random(-1, 1);
}
}
}
for (Matrix m : biases) {
for (int i = 0; i < m.i; i++) {
for (int j = 0; j < m.j; j++) {
m.values[i][j] = 1;
}
}
}
for (Matrix m : sums) {
for (int i = 0; i < m.i; i++) {
for (int j = 0; j < m.j; j++) {
m.values[i][j] = 0;
}
}
}
}
//Calculation------------------------------------------------------------------------------------------------------
void calculate(float[] inputs) {
this.inputs = new Matrix(0, 0);
this.inputs = this.inputs.arrayToCollumn(inputs);
sums[0] = (weights[0].matrixMult(this.inputs)).sum(biases[0]);
activations[0] = sigM(sums[0]);
for (int i = 1; i < netSize - 1; i++) {
sums[i] = weights[i].matrixMult(activations[i - 1]);
activations[i] = sigM(sums[i]).sum(biases[i]);
}
//there are no biases in the output layer
//and the output layer uses the sigmoid function
sums[netSize - 1] = weights[netSize - 1].matrixMult(activations[netSize - 1 - 1]);
activations[netSize - 1] = sigM(sums[netSize - 1]);
}
//Sending outputs--------------------------------------------------------------------------------------------------
Matrix getOuts() {
return activations[netSize - 1];
}
//Backpropagation--------------------------------------------------------------------------------------------------
void calcError(float[] exp) {
Matrix expected = new Matrix(0, 0);
expected = expected.arrayToCollumn(exp);
//E = (output - expected)
error[netSize - 1] = this.getOuts().diff(expected);
samples++;
}
void backPropag(int layer) {
if (layer == netSize - 1) {
error[layer].scalarDiv(samples);
for (int i = layer - 1; i >= 0; i--) {
prevLayerCost(i);
}
weightError(layer);
backPropag(layer - 1);
} else {
weightError(layer);
biasError(layer);
if (layer != 0)
backPropag(layer - 1);
}
}
void weightError(int layer) {
if (layer != 0) {
for (int i = 0; i < weights[layer].i; i++) {
for (int j = 0; j < weights[layer].j; j++) {
float changeWeight = 0;
if (layer != netSize - 1)
changeWeight = activations[layer - 1].values[j][0] * deriSig(sums[layer].values[i][0]) * error[layer].values[i][0];
else
changeWeight = activations[layer - 1].values[j][0] * deriSig(sums[layer].values[i][0]) * error[layer].values[i][0];
weights[layer].values[i][j] += -learningRate * changeWeight;
}
}
} else {
for (int i = 0; i < weights[layer].i; i++) {
for (int j = 0; j < weights[layer].j; j++) {
float changeWeight = this.inputs.values[j][0] * deriSig(sums[layer].values[i][0]) * error[layer].values[i][0];
weights[layer].values[i][j] += -learningRate * changeWeight;
}
}
}
}
void biasError(int layer) {
for (int i = 0; i < biases[layer].i; i++) {
for (int j = 0; j < biases[layer].j; j++) {
float changeBias = 0;
if (layer != netSize - 1)
changeBias = deriSig(sums[layer].values[i][0]) * error[layer].values[i][0];
biases[layer].values[i][j] += -learningRate * changeBias;
}
}
}
void prevLayerCost(int layer) {
for (int i = 0; i < activations[layer].i; i++) {
for (int j = 0; j < activations[layer + 1].j; j++) {//for all connections of that neuron to the next layer
if (layer != netSize - 1)
error[layer].values[i][0] += weights[layer + 1].values[j][i] * deriSig(sums[layer + 1].values[j][0]) * error[layer + 1].values[j][0];
else
error[layer].values[i][0] += weights[layer + 1].values[j][i] * deriSig(sums[layer + 1].values[j][0]) * error[layer + 1].values[j][0];
}
}
}
//Activation Functions---------------------------------------------------------------------------------------------
Matrix reLUM(Matrix m) {
Matrix temp = m.copyM();
for (int i = 0; i < temp.i; i++) {
for (int j = 0; j < temp.j; j++) {
temp.values[i][j] = ReLU(m.values[i][j]);
}
}
return temp;
}
float ReLU(float x) {
return max(0, x);
}
float deriReLU(float x) {
if (x <= 0)
return 0;
else
return 1;
}
Matrix sigM(Matrix m) {
Matrix temp = m.copyM();
for (int i = 0; i < temp.i; i++) {
for (int j = 0; j < temp.j; j++) {
temp.values[i][j] = sig(m.values[i][j]);
}
}
return temp;
}
float sig(float x) {
return 1 / (1 + exp(-x));
}
float deriSig(float x) {
return sig(x) * (1 - sig(x));
}
//Saving Files-----------------------------------------------------------------------------------------------------
void SaveNeuNet() {
for (int i = 0; i < weights.length; i++) {
weights[i].saveM("weights\\weightLayer" + i);
}
for (int i = 0; i < biases.length; i++) {
biases[i].saveM("biases\\biasLayer" + i);
}
for (int i = 0; i < activations.length; i++) {
activations[i].saveM("activations\\activationLayer" + i);
}
for (int i = 0; i < error.length; i++) {
error[i].saveM("errors\\errorLayer" + i);
}
}
}
Here is the Matrix code:
class Matrix {
int i, j, size;
float[][] values;
Matrix(int i, int j) {
this.i = i;
this.j = j;
this.size = i * j;
values = new float[i][j];
}
Matrix sum (Matrix other) {
if (other.i == this.i && other.j == this.j) {
for (int x = 0; x < this.i; x++) {
for (int z = 0; z < this.j; z++) {
values[x][z] += other.values[x][z];
}
}
return this;
}
return null;
}
Matrix diff(Matrix other) {
if (other.i == this.i && other.j == this.j) {
for (int x = 0; x < this.i; x++) {
for (int z = 0; z < this.j; z++) {
values[x][z] -= other.values[x][z];
}
}
return this;
}
return null;
}
Matrix scalarMult(float k) {
for (int i = 0; i < this.i; i++) {
for (int j = 0; j < this.j; j++) {
values[i][j] *= k;
}
}
return this;
}
Matrix scalarDiv(float k) {
if (k != 0) {
for (int i = 0; i < this.i; i++) {
for (int j = 0; j < this.j; j++) {
values[i][j] /= k;
}
}
return this;
} else
return null;
}
Matrix matrixMult(Matrix other) {
if (this.j != other.i)
return null;
else {
Matrix temp = new Matrix(this.i, other.j);
for (int i = 0; i < temp.i; i++) {
for (int j = 0; j < temp.j; j++) {
for (int k = 0; k < this.j; k++) {
temp.values[i][j] += this.values[i][k] * other.values[k][j];
}
}
}
return temp;
}
}
Matrix squaredValues(){
for (int i = 0; i < this.i; i++){
for (int j = 0; j < this.j; j++){
values[i][j] = sq(values[i][j]);
}
}
return this;
}
void printM() {
for (int x = 0; x < this.i; x++) {
print("| ");
for (int z = 0; z < this.j; z++) {
print(values[x][z] + " | ");
}
println();
}
}
void saveM(String name) {
String out = "";
for (int x = 0; x < this.i; x++) {
out += "| ";
for (int z = 0; z < this.j; z++) {
out += values[x][z] + " | ";
}
out += "\n";
}
saveStrings("outputs\\" + name + ".txt", new String[] {out});
}
Matrix arrayToCollumn(float[] array) {
Matrix temp = new Matrix(array.length, 1);
for (int i = 0; i < array.length; i++)
temp.values[i][0] = array[i];
return temp;
}
Matrix arrayToLine(float[] array) {
Matrix temp = new Matrix(1, array.length);
for (int j = 0; j < array.length; j++)
temp.values[0][j] = array[j];
return temp;
}
Matrix copyM(){
Matrix temp = new Matrix(i, j);
for (int i = 0; i < this.i; i++){
for (int j = 0; j < this.j; j++){
temp.values[i][j] = this.values[i][j];
}
}
return temp;
}
}
As I said, the output always converges to 0.5 instead of the actual values, 1 or 0.
Answer (score: 1)
I rewrote the code and it works now! I don't know what was wrong with the previous code, but this one works:
class NeuralNetwork {
int netSize;
float learningRate;
Matrix[] weights;
Matrix[] biases;
Matrix[] activations;
Matrix[] sums;
Matrix[] errors;
NeuralNetwork(int inNum, int hiddenNum, int[] hiddenLayerSize, int outNum, float learningRate) {
netSize = hiddenNum + 1;
this.learningRate = learningRate;
weights = new Matrix[netSize];
biases = new Matrix[netSize - 1];
activations = new Matrix[netSize];
sums = new Matrix[netSize];
errors = new Matrix[netSize];
initializeMatrices(inNum, hiddenNum, hiddenLayerSize, outNum);
}
//INITIALIZING MATRICES
void initializeMatrices(int inNum, int hiddenNum, int[] layerSize, int outNum) {
for (int i = 0; i < hiddenNum; i++) {
if (i == 0)
weights[i] = new Matrix(layerSize[0], inNum);
else
weights[i] = new Matrix(layerSize[i], layerSize[i - 1]);
biases[i] = new Matrix(layerSize[i], 1);
activations[i] = new Matrix(layerSize[i], 1);
errors[i] = new Matrix(layerSize[i], 1);
sums[i] = new Matrix(layerSize[i], 1);
weights[i].randomize(-1, 1);
biases[i].randomize(-1, 1);
activations[i].randomize(-1, 1);
}
weights[netSize - 1] = new Matrix(outNum, layerSize[layerSize.length - 1]);
activations[netSize - 1] = new Matrix(outNum, 1);
errors[netSize - 1] = new Matrix(outNum, 1);
sums[netSize - 1] = new Matrix(outNum, 1);
weights[netSize - 1].randomize(-1, 1);
activations[netSize - 1].randomize(-1, 1);
}
//---------------------------------------------------------------------------------------------------------------
void forwardPropag(float[] ins) {
Matrix inputs = new Matrix(0, 0);
inputs = inputs.arrayToCollumn(ins);
sums[0] = (weights[0].matrixMult(inputs)).sum(biases[0]);
activations[0] = sigM(sums[0]);
for (int i = 1; i < netSize - 1; i++) {
sums[i] = (weights[i].matrixMult(activations[i - 1])).sum(biases[i]);
activations[i] = sigM(sums[i]);
}
//output layer does not have biases
sums[netSize - 1] = weights[netSize - 1].matrixMult(activations[netSize - 2]);
activations[netSize - 1] = sigM(sums[netSize - 1]);
}
Matrix predict(float[] inputs) {
forwardPropag(inputs);
return activations[netSize - 1].copyM();
}
//SUPERVISED LEARNING - BACKPROPAGATION
void train(float[] inps, float[] expec) {
Matrix expected = new Matrix(0, 0);
expected = expected.arrayToCollumn(expec);
errors[netSize - 1] = predict(inps).diff(expected);
calcErorrPrevLayers();
adjustWeights(inps);
adjustBiases();
for (Matrix m : errors){
m.reset();
}
}
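//Backpropagate the output error to the hidden layers:
//errors[l][i] = sum over j of ( weights[l+1][j][i] * sig'(sums[l+1][j]) * errors[l+1][j] )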
void calcErorrPrevLayers() {
for (int l = netSize - 2; l >= 0; l--) {
for (int i = 0; i < activations[l].i; i++) {
for (int j = 0; j < activations[l + 1].i; j++) {
errors[l].values[i][0] += weights[l + 1].values[j][i] * dSig(sums[l + 1].values[j][0]) * errors[l + 1].values[j][0];
}
}
}
}
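//Gradient-descent step for every weight: dC/dw = a_prev * sig'(z) * error, so w += -learningRate * dC/dw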
void adjustWeights(float[] inputs) {
for (int l = 0; l < netSize; l++) {
if (l == 0) {
//for every neuron n in the first layer
for (int n = 0; n < activations[l].i; n++) {
//for every weight w of the first layer
for (int w = 0; w < inputs.length; w++) {
float weightChange = inputs[w] * dSig(sums[l].values[n][0]) * errors[l].values[n][0];
weights[l].values[n][w] += -learningRate * weightChange;
}
}
} else {
//for every neuron n in layer l
for (int n = 0; n < activations[l].i; n++) {
//for every weight w coming from layer l - 1
for (int w = 0; w < activations[l - 1].i; w++) {
float weightChange = activations[l - 1].values[w][0] * dSig(sums[l].values[n][0]) * errors[l].values[n][0];
weights[l].values[n][w] += -learningRate * weightChange;
}
}
}
}
}
void adjustBiases() {
for (int l = 0; l < netSize - 1; l++) {
//for every neuron n in layer l
for (int n = 0; n < activations[l].i; n++) {
float biasChange = dSig(sums[l].values[n][0]) * errors[l].values[n][0];
biases[l].values[n][0] += -learningRate * biasChange;
}
}
}
//ACTIVATION FUNCTION
float sig(float x) {
return 1 / (1 + exp(-x));
}
float dSig(float x) {
return sig(x) * (1 - sig(x));
}
Matrix sigM(Matrix m) {
Matrix temp = m.copyM();
for (int i = 0; i < m.i; i++) {
for (int j = 0; j < m.j; j++) {
temp.values[i][j] = sig(m.values[i][j]);
}
}
return temp;
}
}
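For completeness, here is a minimal sketch (not part of the original answer) of how this rewritten class could be driven on XOR from a Processing sketch. The hidden layer size, learning rate and epoch count are assumptions chosen for illustration, and it reuses the Matrix class posted in the question; note that the rewritten NeuralNetwork also calls randomize() and reset() on Matrix, so those two helpers would have to be added to the Matrix class shown above.
//Illustrative XOR training loop (hyperparameters are assumptions, not values from the answer)
NeuralNetwork nn;
float[][] xorIn  = { {0, 0}, {0, 1}, {1, 0}, {1, 1} };
float[][] xorOut = { {0}, {1}, {1}, {0} };
void setup() {
  //2 inputs, 1 hidden layer of 4 neurons, 1 output, learning rate 0.5 (assumed values)
  nn = new NeuralNetwork(2, 1, new int[] {4}, 1, 0.5f);
  //train on all four samples for a fixed number of epochs
  for (int epoch = 0; epoch < 10000; epoch++) {
    for (int s = 0; s < xorIn.length; s++) {
      nn.train(xorIn[s], xorOut[s]);
    }
  }
  //print the prediction for each input pair; the outputs should approach 0 or 1
  for (int s = 0; s < xorIn.length; s++) {
    print(xorIn[s][0] + " XOR " + xorIn[s][1] + " -> ");
    nn.predict(xorIn[s]).printM();
  }
}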