我目前正在通过Andrew Ng教授关于Coursera的在线课程学习BPNN。我想我已经理解了这种方法,并尝试使用C ++和Armadillo(线性代数库)来实现它。
我正在安德鲁的幻灯片中使用的东西,但总体而言代码不能正常运行,任何能够找出这段代码有什么问题的人呢?
安德鲁的幻灯片在这里:Slide_8和slide_9,有关计算a的内容在讲座8中,其他内容如费用函数J(theta)和dJ(theta)在讲座9中。这是我的代码。
#define MAX_ITER 500
double lrate = 0.1;
double lambda = 0.0;
int numLayers;
int numHiddenLayerNode;
int numOutputNodes;
int numHiddenLayers = 1;
colvec vec2colvec(vector<double>& vec){
int length = vec.size();
colvec A(length);
for(int i=0; i<length; i++){
A(i) = vec[i];
}
return A;
}
rowvec vec2rowvec(vector<double>& vec){
colvec A = vec2colvec(vec);
return A.t();
}
mat vec2mat(vector<vector<double> >&vec){
int cols = vec.size();
int rows = vec[0].size();
mat A(rows, cols);
for(int i = 0; i<rows; i++){
for(int j=0; j<cols; j++){
A(i, j) = vec[j][i];
}
}
return A;
}
colvec log(colvec vec){
for(int i = 0; i < vec.size(); i++){
vec(i) = log(vec(i));
}
return vec;
}
rowvec log(rowvec vec){
for(int i = 0; i < vec.size(); i++){
vec(i) = log(vec(i));
}
return vec;
}
double sigmoid(double z){
return 1 / (exp(-z) + 1);
}
rowvec sigmoid(rowvec z){
for(int i=0; i<z.size(); i++){
z(i) = sigmoid(z(i));
}
return z;
}
colvec sigmoid(colvec z){
rowvec temp = z.t();
return (sigmoid(temp)).t();
}
double dsigmoid(double z){
return z * (1 - z);
}
colvec dsigmoid(colvec a){
colvec one = ones<colvec>(a.size());
return a % (one - a);
}
rowvec dsigmoid(rowvec a){
colvec temp = a.t();
return (dsigmoid(temp)).t();
}
vector<colvec> getA(mat x, vector<mat>& weightsMatrix, int m){
vector<colvec> a;
colvec temp1(x.n_rows);
a.push_back(temp1);
for(int i=0; i<numHiddenLayers; i++){
colvec temp(numHiddenLayerNode);
a.push_back(temp);
}
colvec temp2(numOutputNodes);
a.push_back(temp2);
colvec one = ones<colvec>(1);
for(int i = 0; i < a.size(); i++){
if(i == 0) a[i] = x.col(m);
else{
colvec xtemp = a[i - 1];
xtemp = join_cols(one, xtemp);
a[i] = weightsMatrix[i - 1] * xtemp;
a[i] = sigmoid(a[i]);
}
}
return a;
}
//h(xi) is just last vector of a
colvec gethm(vector<colvec> a){
return a[a.size() - 1];
}
colvec getCostFunction(mat x, vector<mat>& weightsMatrix, mat y, double lambda){
int nsamples = x.n_cols;
colvec sum = zeros<colvec>(y.n_rows);
for(int m = 0; m < nsamples; m++){
vector<colvec> a = getA(x, weightsMatrix, m);
colvec hx = gethm(a);
colvec one = ones<colvec>(hx.size());
for(int k = 0; k < y.n_rows; k++){
if(y(k, m) == 0.0){
sum = sum + log(one - hx);
}else{
sum = sum + log(hx);
}
}
}
sum = sum / (double)(- nsamples);
double temp = lambda / 2 / nsamples;
double sum2 = 0.0;
for(int i = 0; i < weightsMatrix.size() - 1; i++){
for(int j = 0; j < weightsMatrix[i].n_cols; j++){
for(int k = 0; k < weightsMatrix[i].n_rows; k++){
sum2 += weightsMatrix[i](k, j) * weightsMatrix[i](k, j);
}
}
}
return sum + temp * sum2;
}
vector<mat> getdJ(mat x, mat y, vector<mat>& weightsMatrix, double lambda){
//big delta is temp variables for calculating dJ
//let every variables in bigDelta to be zero.
vector<mat> bigDelta;
for(int i=0; i<weightsMatrix.size(); i++){
mat temp = zeros<mat>(weightsMatrix[i].n_rows, weightsMatrix[i].n_cols);
bigDelta.push_back(temp);
}
vector<mat> dJ;
for(int i=0; i<weightsMatrix.size(); i++){
mat temp = zeros<mat>(weightsMatrix[i].n_rows, weightsMatrix[i].n_cols);
dJ.push_back(temp);
}
int nsamples = x.n_cols;
//use backProp method
for(int m = 0; m < nsamples; m++){
vector<colvec> a = getA(x, weightsMatrix, m);
vector<colvec> tempDelta;
for(int i=0; i<a.size(); i++){
colvec temp = zeros<colvec>(a[i].size());
tempDelta.push_back(temp);
}
//no tempDelta[0]
for(int l = tempDelta.size() - 1; l > 0; l --){
if(l == tempDelta.size() - 1){
tempDelta[l] = a[l] - y.col(m);
}else{
mat mult = (weightsMatrix[l]).t() * tempDelta[l + 1];
tempDelta[l] = mult.rows(1, mult.n_rows - 1) % dsigmoid(a[l]);
}
}
for(int l = 0; l < bigDelta.size(); l++){
colvec tp = ones<colvec>(1);
tp = join_cols(tp, a[l]);
bigDelta[l] += tempDelta[l + 1] * tp.t();
}
}
for(int l = 0; l < bigDelta.size(); l++){
dJ[l] = bigDelta[l] / (double)nsamples;
dJ[l] = dJ[l] + lambda * weightsMatrix[l];
for(int j = 0; j < dJ[l].n_rows; j++){
dJ[l](j, 0) = dJ[l](j, 0) - lambda * weightsMatrix[l](j, 0);
}
}
return dJ;
}
colvec calculateY(colvec x, vector<mat> weightsMatrix){
colvec result(x);
colvec tp = ones<colvec>(1);
for(int i=0; i<weightsMatrix.size(); i++){
result = join_cols(tp, result);
result = weightsMatrix[i] * result;
}
return result;
}
void bpnn(vector<vector<double> >&vecX, vector<vector<double> >&vecY, vector<vector<double> >& testX, vector<vector<double> >& testY){
int nsamples = vecX.size();
int nfeatures = vecX[0].size();
//change vecX and vecY into matrix or vector.
mat y = vec2mat(vecY);
mat x = vec2mat(vecX);
numLayers = numHiddenLayers + 1;
numHiddenLayerNode = nfeatures * 5;
numOutputNodes = vecY[0].size();
//build weights matrices and randomly initialize them.
vector<mat> weightsMatrix;
mat tempmat;
double init_epsilon = 0.12;
//input --> first hidden layer:
tempmat = randu<mat>(numHiddenLayerNode, nfeatures + 1);
weightsMatrix.push_back(tempmat);
//hidden layer --> hidden layer :
for(int i=0; i< numHiddenLayers - 1; i++){
tempmat = randu<mat>(numHiddenLayerNode, numHiddenLayerNode + 1);
weightsMatrix.push_back(tempmat);
}
//last hidden layer --> output layer:
tempmat = randu<mat>(numOutputNodes, numHiddenLayerNode + 1);
weightsMatrix.push_back(tempmat);
for(int i=0; i<weightsMatrix.size(); i++){
weightsMatrix[i] = weightsMatrix[i] * (2 * init_epsilon) - init_epsilon;
}
//till now, we finished building weights matrices.
/*
//Gradient Checking (this checking gives right answers)
vector<mat> dJ = getdJ(x, y, weightsMatrix, lambda);
cout<<"test!!!!"<<endl;
double step = 1e-4;
for(int i=0; i<weightsMatrix.size(); i++){
cout<<"################ Weight Layer "<<i<<endl;
for(int j=0; j<weightsMatrix[i].n_rows; j++){
for(int k=0; k<weightsMatrix[i].n_cols; k++){
double memo = weightsMatrix[i](j, k);
weightsMatrix[i](j, k) = memo + step;
colvec value1 = getCostFunction(x, weightsMatrix, y, 0);
weightsMatrix[i](j, k) = memo - step;
colvec value2 = getCostFunction(x, weightsMatrix, y, 0);
colvec tp = (value1 - value2) / (2 * step);
cout<<tp(0)<<", "<<dJ[i](j, k)<<endl;
weightsMatrix[i](j, k) = memo;
}
}
}
*/
int converge = 0;
while(converge < MAX_ITER){
vector<mat> dJ = getdJ(x, y, weightsMatrix, lambda);
for(int j = 0; j < weightsMatrix.size(); j++){
weightsMatrix[j] -= lrate * dJ[j];
}
colvec cost = getCostFunction(x, weightsMatrix, y, lambda);
double costdouble = cost(0);
cout<<"learning step: "<<converge<<", Cost function value = "<<costdouble<<endl;
++ converge;
}
cout<<"############result#############"<<endl;
for(int i=0; i<testX.size(); i++){
colvec tpcol = vec2colvec(testX[i]);
colvec result = calculateY(tpcol, weightsMatrix);
result = sigmoid(result);
cout<<result<<endl;
}
}
我尝试了Gradient检查方法,这给出了正确的答案,所以我认为J(theta)部分是正确的。
因为我使用1 /(exp(-z)+1)Sigmoid函数,所以输出应该是> = 0.5值或<0.5值,但是现在值都大于0.5,我真的困惑。
(很明显,犰狳中的'%'表示'。*',而我使用的其他犰狳函数非常清晰,与matlab或八度音阶相似)
感谢。