我正在编写一个神经网络,但在理解和实现反向传播时遇到了困难:网络无法正确学习。我不知道我的反向传播函数哪里出了问题。
这是我的误差函数:$L = p\log q + (1-p)\log(1-q)$(其中 $p$ 是目标值,$q$ 是网络的输出)
我的激活函数是 $f(x) = \frac{1}{1 + e^{-x}}$
我正在用一个简单的问题 x > y 来测试,这个问题其实并不需要多层感知器。
#include <bits/stdc++.h>
using namespace std;
// Learning rate: every weight adjustment made during training is scaled by it.
constexpr double ETA = 0.001;
// Training stops once the summed squared weight change is no longer above this.
constexpr double EPS = 1e-6;
// When true, progress messages are written to standard output.
constexpr bool OUTPUT = true;
// Callable shape shared by activation functions and their derivatives:
// called as (weights, inputs) and returning a single value.
using func = std::function<double(std::vector<double>, std::vector<double>)>;
/// Returns the dot product of two equally sized vectors (0.0 for empty ones).
///
/// The vectors are taken by const reference so that a call no longer copies
/// both of them. Terms are accumulated left to right starting from 0.0.
///
/// @param a first vector
/// @param b second vector; must have the same length as `a` (asserted)
/// @return sum over i of a[i] * b[i]
double scalar_product(const std::vector<double>& a, const std::vector<double>& b)
{
    assert(a.size() == b.size());
    return std::inner_product(a.begin(), a.end(), b.begin(), 0.0);
}
/**
 * A single neuron: a weight vector plus an activation function `h` and the
 * function `d` that evaluates the activation's derivative.
 *
 * Both callables are invoked as (weights, inputs). The weight vector has
 * n + 1 entries, each initialised to rand() / RAND_MAX, i.e. a value in [0, 1].
 */
class Perceptron
{
public:
    int n;                  // number of inputs; `w` holds n + 1 weights
    double val = 0.0;       // h(w, x) from the most recent calc(); 0 before the first call
    double der = 0.0;       // d(w, x) from the most recent calc(); 0 before the first call
    double delta = 0.0;     // never written by this class; starts at 0 instead of garbage
    std::vector<int> next;  // not used by this class
    std::vector<double> w;
    std::function<double(std::vector<double>, std::vector<double>)> h;
    std::function<double(std::vector<double>, std::vector<double>)> d;

    /// @param n number of inputs
    /// @param h activation, called as h(weights, inputs)
    /// @param d derivative of the activation, called as d(weights, inputs)
    Perceptron(int n,
               std::function<double(std::vector<double>, std::vector<double>)> h,
               std::function<double(std::vector<double>, std::vector<double>)> d)
        : n(n), h(std::move(h)), d(std::move(d))
    {
        const int weightCount = n + 1;
        if (weightCount > 0)
            w.reserve(static_cast<std::size_t>(weightCount));
        for (int i = 0; i < weightCount; i++)
            w.push_back(1.0 * std::rand() / RAND_MAX);
    }

    /// Evaluates the neuron for input `x`, storing the activation in `val`
    /// and its derivative in `der`.
    /// @return the activation value (same as `val`)
    double calc(const std::vector<double>& x)
    {
        val = h(w, x);
        der = d(w, x);
        return val;
    }
};
class NeuralNetwork
{
public:
int inputLayer,hiddenLayer,outputLayer;
vector<Perceptron> inputNeurons;
vector<Perceptron> hiddenNeurons;
vector<Perceptron> outputNeurons;
NeuralNetwork(int in,int hid,int out)
{
inputLayer=in;
hiddenLayer=hid;
outputLayer=out;
auto logistic = [] (vector<double> w, vector<double> x) -> double
{
x.push_back(1.0);
return 1.0 / (1.0 + exp(-scalar_product(w, x)));
};
auto d_logistic = [logistic] (vector<double> w, vector<double> x) -> double
{
double lst = logistic(w, x);
return lst * (1.0 - lst);
};
auto input = [] (vector<double> w, vector<double> x) -> double
{
return x[0];
};
auto d_input = [] (vector<double> w, vector<double> x) -> double
{
return 1.0;
};
auto ident = [] (vector<double> w, vector<double> x) -> double
{
x.push_back(1.0);
return scalar_product(w, x);
};
auto d_ident = [] (vector<double> w, vector<double> x) -> double
{
return 1.0;
};
for(int i=0; i<inputLayer; i++)
inputNeurons.push_back(Perceptron(1,input,d_input));
if(OUTPUT)cout<<"Created "<<inputLayer<<" input neurons."<<endl;
for(int i=0; i<hiddenLayer; i++)
hiddenNeurons.push_back(Perceptron(inputLayer,logistic,d_logistic));
if(OUTPUT)cout<<"Created "<<hiddenLayer<<" hidden neurons."<<endl;
for(int i=0; i<outputLayer; i++)
outputNeurons.push_back(Perceptron(hiddenLayer,logistic,d_logistic));
if(OUTPUT)cout<<"Created "<<outputLayer<<" output neurons."<<endl;
}
vector<double> feedForward(vector<double> x)
{
for(int i=0; i<inputLayer; i++)
inputNeurons[i].calc({x[i]});
vector<double> inputLayerOut;
for(int i=0; i<inputLayer; i++)
inputLayerOut.push_back(inputNeurons[i].val);
for(int i=0; i<hiddenLayer; i++)
hiddenNeurons[i].calc(inputLayerOut);
vector<double> hiddenLayerOut;
for(int i=0; i<hiddenLayer; i++)
hiddenLayerOut.push_back(hiddenNeurons[i].val);
for(int i=0; i<outputLayer; i++)
outputNeurons[i].calc(hiddenLayerOut);
vector<double> outputLayerOut;
for(int i=0; i<outputLayer; i++)
outputLayerOut.push_back(outputNeurons[i].val);
return outputLayerOut;
}
void backPropagation(vector<vector<double> > x, vector<vector<double> > y, int max_steps)
{
double diff;
do
{
diff=0.0;
for(int t=0; t<x.size(); t++)
{
vector<double> out=feedForward(x[t]);
for(int i=0; i<outputLayer; i++)
outputNeurons[i].delta=(y[t][i]-outputNeurons[i].val);
for(int i=0; i<hiddenLayer; i++)
{
double sum=0.0;
for(int j=0; j<outputLayer; j++)
sum+=outputNeurons[j].delta*hiddenNeurons[i].w[j];
hiddenNeurons[i].delta=hiddenNeurons[i].der*sum;
}
}
for(int i=0; i<outputLayer; i++)
{
for (int j=0; j<outputNeurons[i].w.size(); j++)
{
double z = (j < outputNeurons[i].n) ? hiddenNeurons[j].val : 1.0;
double curr = ETA * outputNeurons[i].delta * z;
outputNeurons[i].w[j] += curr;
diff += curr * curr;
}
}
for(int i=0; i<hiddenLayer; i++)
{
for (int j=0; j<hiddenNeurons[i].w.size(); j++)
{
double z = (j < hiddenNeurons[i].n) ? inputNeurons[j].val : 1.0;
double curr = ETA * hiddenNeurons[i].delta * z;
hiddenNeurons[i].w[j] += curr;
diff += curr * curr;
}
}
if(OUTPUT&&max_steps%100==0)cout<<"Current diff: "<<diff<<endl;
}
while(diff>EPS&&max_steps--);
}
};
/// Trains a 2-5-1 network on 100 random (x, y) pairs labelled 1 when x > y and
/// 0 otherwise, then answers "x y" queries read from standard input until the
/// input ends or cannot be parsed.
int main()
{
    //srand(time(NULL));
    // Seeding is disabled, so rand() produces the same sequence on every run.
    const int sampleCount = 100;
    const int valueRange = 100;  // samples are drawn from 0 .. valueRange - 1

    NeuralNetwork nn(2, 5, 1);

    std::vector<std::vector<double> > trainingInput;
    std::vector<std::vector<double> > trainingOutput;
    for (int i = 0; i < sampleCount; i++)
    {
        const int x = std::rand() % valueRange;
        const int y = std::rand() % valueRange;
        // Inputs are scaled into [0, 1): raw values up to 99 would push the
        // logistic hidden units into saturation, where the derivative is
        // close to zero and the hidden weights barely change.
        trainingInput.push_back({static_cast<double>(x) / valueRange,
                                 static_cast<double>(y) / valueRange});
        trainingOutput.push_back({x > y ? 1.0 : 0.0});
    }
    nn.backPropagation(trainingInput, trainingOutput, 10000);

    // Stop when reading fails (EOF or malformed input) instead of looping
    // forever. Queries are scaled the same way as the training data, and the
    // explicit casts avoid an int -> double narrowing conversion inside the
    // braced initialiser.
    int x, y;
    while (std::cin >> x >> y)
    {
        std::cout << nn.feedForward({static_cast<double>(x) / valueRange,
                                     static_cast<double>(y) / valueRange})[0] << '\n';
    }
    return 0;
}
答案 0 :(得分:1)
对于反向传播代码,关键是正确计算梯度。您可以计算梯度的数值近似,并将其与代码算出的梯度进行比较,以此检查梯度是否正确。梯度的数值近似为 $\frac{F(x; w + dw) - F(x; w)}{dw}$,其中 $x$ 是输入向量(可以取任意值,对此目的无关紧要),$w$ 是某一个权重或偏置,$dw$ 是一个很小的数(例如 1e-4)。对每个权重和偏置都计算这个差商,然后将得到的近似梯度向量与您的代码计算出的梯度进行比较。两者应该大致相同,并且随着 $dw$ 变小应该越来越接近。