反向传播神经网络没有正确学习

时间:2016-06-20 21:14:13

标签: c++ machine-learning backpropagation

我编码神经网络,我在理解和编码反向传播方面遇到了麻烦,但它没有正确学习。我不知道我的反向传播功能在哪里出问题。

这是我的错误函数L = plogq + (1−p)log(1−q)

我的激活功能是f(x)=1/(1+e^(-x))

我正在尝试简单的问题x > y,它并不真正需要多层感知器。

#include <bits/stdc++.h>

using namespace std;
const double ETA=0.001;
const double EPS=0.000001;
const bool OUTPUT=true;
typedef function<double(vector<double>, vector<double>)> func;
double scalar_product(vector<double> a, vector<double> b)
{
    assert(a.size() == b.size());
    double ret = 0.0;
    for (int i=0; i<a.size(); i++)
    {
        ret += a[i] * b[i];
    }
    return ret;
}
class Perceptron
{
public:
    int n;
    double val,der;
    double delta;
    vector<int> next;
    vector<double> w;
    function<double(vector<double>, vector<double>)> h;
    function<double(vector<double>, vector<double>)> d;

    Perceptron(int n,func h,func d):n(n),h(h),d(d)
    {
        for(int i=0; i<n+1; i++)
            w.push_back(1.0*rand()/RAND_MAX);
    }
    double calc(vector<double> x)
    {
        val=h(w,x);
        der=d(w,x);
        return val;
    }
};
class NeuralNetwork
{
public:
    int inputLayer,hiddenLayer,outputLayer;
    vector<Perceptron> inputNeurons;
    vector<Perceptron> hiddenNeurons;
    vector<Perceptron> outputNeurons;
    NeuralNetwork(int in,int hid,int out)
    {
        inputLayer=in;
        hiddenLayer=hid;
        outputLayer=out;

        auto logistic = [] (vector<double> w, vector<double> x) -> double
        {
            x.push_back(1.0);
            return 1.0 / (1.0 + exp(-scalar_product(w, x)));
        };
        auto d_logistic = [logistic] (vector<double> w, vector<double> x) -> double
        {
            double lst = logistic(w, x);
            return lst * (1.0 - lst);
        };

        auto input = [] (vector<double> w, vector<double> x) -> double
        {
            return x[0];
        };
        auto d_input = [] (vector<double> w, vector<double> x) -> double
        {
            return 1.0;
        };

        auto ident = [] (vector<double> w, vector<double> x) -> double
        {
            x.push_back(1.0);
            return scalar_product(w, x);
        };
        auto d_ident = [] (vector<double> w, vector<double> x) -> double
        {
            return 1.0;
        };

        for(int i=0; i<inputLayer; i++)
            inputNeurons.push_back(Perceptron(1,input,d_input));
        if(OUTPUT)cout<<"Created "<<inputLayer<<" input neurons."<<endl;
        for(int i=0; i<hiddenLayer; i++)
            hiddenNeurons.push_back(Perceptron(inputLayer,logistic,d_logistic));
        if(OUTPUT)cout<<"Created "<<hiddenLayer<<" hidden neurons."<<endl;
        for(int i=0; i<outputLayer; i++)
            outputNeurons.push_back(Perceptron(hiddenLayer,logistic,d_logistic));
        if(OUTPUT)cout<<"Created "<<outputLayer<<" output neurons."<<endl;

    }
    vector<double> feedForward(vector<double> x)
    {
        for(int i=0; i<inputLayer; i++)
            inputNeurons[i].calc({x[i]});
        vector<double> inputLayerOut;
        for(int i=0; i<inputLayer; i++)
            inputLayerOut.push_back(inputNeurons[i].val);

        for(int i=0; i<hiddenLayer; i++)
            hiddenNeurons[i].calc(inputLayerOut);
        vector<double> hiddenLayerOut;
        for(int i=0; i<hiddenLayer; i++)
            hiddenLayerOut.push_back(hiddenNeurons[i].val);

        for(int i=0; i<outputLayer; i++)
            outputNeurons[i].calc(hiddenLayerOut);
        vector<double> outputLayerOut;
        for(int i=0; i<outputLayer; i++)
            outputLayerOut.push_back(outputNeurons[i].val);
        return outputLayerOut;
    }
    void backPropagation(vector<vector<double> > x, vector<vector<double> > y, int max_steps)
    {
        double diff;
        do
        {
            diff=0.0;
            for(int t=0; t<x.size(); t++)
            {
                vector<double> out=feedForward(x[t]);
                for(int i=0; i<outputLayer; i++)
                    outputNeurons[i].delta=(y[t][i]-outputNeurons[i].val);
                for(int i=0; i<hiddenLayer; i++)
                {
                    double sum=0.0;
                    for(int j=0; j<outputLayer; j++)
                        sum+=outputNeurons[j].delta*hiddenNeurons[i].w[j];
                    hiddenNeurons[i].delta=hiddenNeurons[i].der*sum;
                }
            }
            for(int i=0; i<outputLayer; i++)
            {
                for (int j=0; j<outputNeurons[i].w.size(); j++)
                {
                    double z = (j < outputNeurons[i].n) ? hiddenNeurons[j].val : 1.0;
                    double curr = ETA * outputNeurons[i].delta * z;
                    outputNeurons[i].w[j] += curr;
                    diff += curr * curr;
                }
            }
            for(int i=0; i<hiddenLayer; i++)
            {
                for (int j=0; j<hiddenNeurons[i].w.size(); j++)
                {
                    double z = (j < hiddenNeurons[i].n) ? inputNeurons[j].val : 1.0;
                    double curr = ETA * hiddenNeurons[i].delta * z;
                    hiddenNeurons[i].w[j] += curr;
                    diff += curr * curr;
                }
            }
            if(OUTPUT&&max_steps%100==0)cout<<"Current diff: "<<diff<<endl;
        }
        while(diff>EPS&&max_steps--);
    }

};

int main()
{
    //srand(time(NULL));
    NeuralNetwork nn=NeuralNetwork(2,5,1);
    vector<vector<double> > trainingInput;
    vector<vector<double> > trainingOutput;
    trainingInput.resize(100);
    trainingOutput.resize(100);
    for(int i=0; i<100; i++)
    {
        int x=rand()%100;
        int y=rand()%100;
        trainingInput[i].push_back(x);
        trainingInput[i].push_back(y);
        trainingOutput[i].push_back(x>y?1:0);
    }
    nn.backPropagation(trainingInput,trainingOutput,10000);
    while(true)
    {
        int x,y;
        cin>>x>>y;
        cout<<nn.feedForward({x,y})[0]<<endl;
    }
    return 0;
}

1 个答案:

答案 0 :(得分:1)

对于反向传播代码,关键是正确计算梯度。您可以通过计算数值近似并比较它来检查渐变。渐变的数值近似仅为(F(x; w + dw) - F(x; w))/dw,其中x是输入向量(可能是任何东西,对此目的无关紧要),w是权重之一或偏见,dw是一个小数字(例如1e-4)。对于每个权重和偏差,计算该差异,然后将近似向量与您从代码中获得的任何内容进行比较。它们应该大致相同,并且在dw变小时应该更接近。