XOR neural network converges to 0.5

Date: 2018-05-25 04:53:30

Tags: java machine-learning neural-network xor

I can't seem to find anything wrong with my neural network, despite having verified it against this example, which suggests my backprop and forward prop are working correctly. However, after training on XOR, my network returns roughly 0.5 for every input, no matter what that input is. In other words, the network seems to minimize the error without ever picking up on the correlation between the inputs and the outputs. Since a single iteration of backpropagation appears to work correctly, my intuition would suggest that the problem lies in the iterations that follow. However, there is no obvious problem that would cause this, which leaves me thoroughly stumped.

I have looked at other threads where similar problems came up, but most of the time the mistake was either something very small in how the network was set up, or the parameters such as the learning rate or the number of epochs were badly off. Is anyone familiar with a case like this?

import java.util.Random;

public class Net
{
int[] sizes;
double LEARNING_RATE;

double[][][] weights;
double[][] bias;

Random rand = new Random();  //53489085

public Net(int[] sizes_, double LEARNING_RATE_)
{
    LEARNING_RATE = LEARNING_RATE_;
    sizes = sizes_;

    int numInputs = sizes[0];
    double range = 1.0 / Math.sqrt(numInputs);
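    // weights and biases below are drawn uniformly from [-range, range], i.e. ±1/sqrt(number of inputs)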

    bias = new double[sizes.length - 1][];
    weights = new double[sizes.length - 1][][];

    for(int w_layer = 0; w_layer < weights.length; w_layer++)
    {
        bias[w_layer] = new double[sizes[w_layer+1]];
        weights[w_layer] = new double[sizes[w_layer+1]][sizes[w_layer]];
        for(int j = 0; j < weights[w_layer].length; j++)
        {
            bias[w_layer][j] = 2*range*rand.nextDouble() - range;
            for(int i = 0; i < weights[w_layer][0].length; i++)
            {
                weights[w_layer][j][i] = 2*range*rand.nextDouble() - range;
            }
        }
    }
}

public double[] evaluate(double[] image_vector)
{
    return forwardPass(image_vector)[sizes.length-1];
}

public double totalError(double[][] expec, double[][] actual)
{
    double sum = 0;
    for(int i = 0; i < expec.length; i++)
    {
        sum += error(expec[i], evaluate(actual[i]));
    }
    return sum / expec.length;
}

private double error(double[] expec, double[] actual)
{
    double sum = 0;
    for(int i = 0; i < expec.length; i++)
    {
        double del = expec[i] - actual[i];
        sum += 0.5 * del * del;
    }
    return sum;
}

public void backpropagate(double[][] image_vector, double[][] outputs)
{
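    // gradient accumulators: per-example deltas are summed here, then averaged over the batch at the end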
    double[][][] deltaWeights = new double[weights.length][][];
    double[][] deltaBias = new double[weights.length][];

    for(int w = 0; w < weights.length; w++)
    {
        deltaBias[w] = new double[bias[w].length];
        deltaWeights[w] = new double[weights[w].length][];
        for(int j = 0; j < weights[w].length; j++)
        {
            deltaWeights[w][j] = new double[weights[w][j].length];
        }
    }

    for(int batch = 0; batch < image_vector.length; batch++)
    {
        double[][] neuronVals = forwardPass(image_vector[batch]);

        /* OUTPUT DELTAS */
        int w_layer = weights.length-1;

        double[] deltas = new double[weights[w_layer].length];

        for(int j = 0; j < weights[w_layer].length; j++)
        {
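            // output-layer delta: squared-error derivative (actual - expec) times the sigmoid derivative actual * (1 - actual)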
            double actual = neuronVals[w_layer + 1][j]; 
            double expec = outputs[batch][j];

            double deltaErr = actual - expec;
            double deltaSig = actual * (1 - actual);

            double delta = deltaErr * deltaSig;
            deltas[j] = delta;

            deltaBias[w_layer][j] += delta;
            for(int i = 0; i < weights[w_layer][0].length; i++)
            {
                deltaWeights[w_layer][j][i] += delta * neuronVals[w_layer][i];
            }
        }

        w_layer--;
        /* REST OF THE DELTAS */
        while(w_layer >= 0)
        {   
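            // hidden-layer delta: weighted sum of the downstream deltas, times the sigmoid derivative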

            double[] nextDeltas = new double[weights[w_layer].length];
            for(int j = 0; j < weights[w_layer].length; j++)
            {
                double outNeur = neuronVals[w_layer+1][j];
                double deltaSig = outNeur * (1 - outNeur);

                double sum = 0;
                for(int i = 0; i < weights[w_layer+1].length; i++)
                {
                    sum += weights[w_layer+1][i][j] * deltas[i];
                }

                double delta = sum * deltaSig;
                nextDeltas[j] = delta;

                deltaBias[w_layer][j] += delta;
                for(int i = 0; i < weights[w_layer][0].length; i++)
                {
                    deltaWeights[w_layer][j][i] += delta * neuronVals[w_layer][i];
                }
            }
            deltas = nextDeltas;

            w_layer--;
        }
    }

    for(int w_layer = 0; w_layer < weights.length; w_layer++)
    {
        for(int j = 0; j < weights[w_layer].length; j++)
        {

            deltaBias[w_layer][j] /= (double) image_vector.length;

            bias[w_layer][j] -= LEARNING_RATE * deltaBias[w_layer][j];

            for(int i = 0; i < weights[w_layer][j].length; i++)
            {   
                deltaWeights[w_layer][j][i] /= (double) image_vector.length; // average of batches
                weights[w_layer][j][i] -= LEARNING_RATE * deltaWeights[w_layer][j][i];
            }
        }
    }
}

public double[][] forwardPass(double[] image_vector)
{
    double[][] outputs = new double[sizes.length][];
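    // outputs[l] holds the activations feeding weight layer l; the last slot receives the final network output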

    double[] inputs = image_vector;

    for(int w = 0; w < weights.length; w++)
    {
        outputs[w] = inputs;

        double[] output = new double[weights[w].length];
        for(int j = 0; j < weights[w].length; j++)
        {
            output[j] = bias[w][j];
            for(int i = 0; i < weights[w][j].length; i++)
            {
                output[j] += weights[w][j][i] * inputs[i];
            }
            output[j] = sigmoid(output[j]);
        }
        inputs = output;
    }

    outputs[outputs.length-1] = inputs.clone();

    return outputs;
}

static public double sigmoid(double val)
{
    return 1.0 / (1.0 + Math.exp(-val));
}
}
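
One way to confirm that backpropagate() computes correct gradients, independently of a worked example, is a numerical gradient check: nudge a single weight by ±h, measure the change in error, and compare against the analytic gradient implied by one update step. The sketch below is illustrative only (the GradientCheck class, its learning rate, and the test case are made up here, not part of the original post), and it relies on the Net fields keeping their default package-private visibility:

public class GradientCheck
{
    public static void main(String[] args)
    {
        double learningRate = 0.1;  // illustrative value
        Net n = new Net(new int[] {2, 2, 1}, learningRate);

        double[][] in  = { {1, 0} };
        double[][] out = { {1} };

        // central-difference estimate of dE/dw for a single weight
        double h = 1e-4;
        double saved = n.weights[0][0][0];
        n.weights[0][0][0] = saved + h;
        double errPlus = n.totalError(out, in);
        n.weights[0][0][0] = saved - h;
        double errMinus = n.totalError(out, in);
        n.weights[0][0][0] = saved;
        double numeric = (errPlus - errMinus) / (2 * h);

        // analytic gradient, recovered from the size of one gradient-descent step
        n.backpropagate(in, out);
        double analytic = (saved - n.weights[0][0][0]) / learningRate;

        // the two values should agree to several decimal places
        System.out.println("numeric:  " + numeric);
        System.out.println("analytic: " + analytic);
    }
}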

My XOR class looks like this. Given how simple it is, the bug is unlikely to be in this part, but I figured it couldn't hurt to post it in case I have some basic misunderstanding of how XOR works. My network is set up to take examples in batches, but as you can see for this particular example below, I send it batches of one, or effectively don't use batches at all.

public class SingleLayer {

static int numEpochs = 10000;
static double LEARNING_RATE = 0.001;
static int[] sizes = new int[] {2, 2, 1};

public static void main(String[] args)
{

    System.out.println("Initializing randomly generate neural net...");
    Net n = new Net(sizes, LEARNING_RATE);
    System.out.println("Complete!");

    System.out.println("Loading dataset...");

    double[][] inputs = new double[4][2];
    double[][] outputs = new double[4][1];

    inputs[0] = new double[] {1, 1};
    outputs[0] = new double[] {0};

    inputs[1] = new double[] {1, 0};
    outputs[1] = new double[] {1};

    inputs[2] = new double[] {0, 1};
    outputs[2] = new double[] {1};

    inputs[3] = new double[] {0, 0};
    outputs[3] = new double[] {0};

    System.out.println("Complete!");

    System.out.println("STARTING ERROR: " + n.totalError(outputs, inputs));
    for(int epoch = 0; epoch < numEpochs; epoch++)
    {
        double[][] in = new double[1][2];
        double[][] out = new double[1][1];
        int num = (int)(Math.random()*inputs.length);

        in[0] = inputs[num];
        out[0] = outputs[num];
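        // note: the full dataset is passed to backpropagate below; the sampled in/out pair is only used for the error printout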

        n.backpropagate(inputs, outputs);
        System.out.println("ERROR: " + n.totalError(out, in));
    }

    System.out.println("Prediction After Training: " + n.evaluate(inputs[0])[0] + "  Expected: " + outputs[0][0]);
    System.out.println("Prediction After Training: " + n.evaluate(inputs[1])[0] + "  Expected: " + outputs[1][0]);
    System.out.println("Prediction After Training: " + n.evaluate(inputs[2])[0] + "  Expected: " + outputs[2][0]);
    System.out.println("Prediction After Training: " + n.evaluate(inputs[3])[0] + "  Expected: " + outputs[3][0]);
}
}

Can anyone offer some insight into what is going wrong? My parameters are well defined, and I have followed all the recommendations on how the weights should be initialized, what the learning rate should be, and so on. Thanks!

2 Answers

Answer 0 (score: 0)

You are only ever presenting the first three inputs to your neural network, because the following line is wrong:

int num = (int)(Math.random() * 3);

Change it to

int num = (int)(Math.random() * inputs.length);

to use all four possible inputs. (With * 3, num can only be 0, 1 or 2, so the {0, 0} case is never presented.)
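
An alternative sketch (not from this answer) that avoids the sampling issue entirely is to cycle through the four cases deterministically:

int num = epoch % inputs.length;  // visits every XOR case once per four epochs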

Answer 1 (score: 0)

I figured it out. I wasn't running enough epochs. That seems a little silly to me, but this visualization showed me that the network lingers on answers of ~0.5 for a long time before driving the error down to 0.00001.
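
For reference, here is a minimal variant of the training loop from main above that trains until the error over the whole truth table falls below a target, rather than stopping after a fixed 10000 epochs. The 1e-5 threshold echoes the error level mentioned above; the epoch cap is an arbitrary safeguard, and how many epochs are actually needed depends on the learning rate:

// train on the full four-example batch until the mean error over the
// whole XOR truth table drops below the target, with a cap as a safeguard
int epoch = 0;
while (n.totalError(outputs, inputs) > 1e-5 && epoch < 10000000)
{
    n.backpropagate(inputs, outputs);
    epoch++;
}
System.out.println("Stopped after " + epoch + " epochs, error: " + n.totalError(outputs, inputs));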