Question

我目前正在通过Andrew Ng教授关于Coursera的在线课程学习BPNN。我想我已经理解了这种方法，并尝试使用C ++和Armadillo（线性代数库）来实现它。

我正在安德鲁的幻灯片中使用的东西，但总体而言代码不能正常运行，任何能够找出这段代码有什么问题的人呢？

安德鲁的幻灯片在这里：Slide_8和slide_9，有关计算a的内容在讲座8中，其他内容如费用函数J（theta）和dJ（theta）在讲座9中。这是我的代码。

#define MAX_ITER 500

double lrate = 0.1;
double lambda = 0.0;

int numLayers;
int numHiddenLayerNode;
int numOutputNodes;
int numHiddenLayers = 1;

colvec vec2colvec(vector<double>& vec){
    int length = vec.size();
    colvec A(length);
    for(int i=0; i<length; i++){
        A(i) = vec[i];
    }
    return A;
}

rowvec vec2rowvec(vector<double>& vec){
    colvec A = vec2colvec(vec);
    return A.t();
}

mat vec2mat(vector<vector<double> >&vec){
    int cols = vec.size();
    int rows = vec[0].size();
    mat A(rows, cols);
    for(int i = 0; i<rows; i++){
        for(int j=0; j<cols; j++){
            A(i, j) = vec[j][i];
        }
    }
    return A;
}

colvec log(colvec vec){
    for(int i = 0; i < vec.size(); i++){
        vec(i) = log(vec(i));
    }
    return vec;
}

rowvec log(rowvec vec){
    for(int i = 0; i < vec.size(); i++){
        vec(i) = log(vec(i));
    }
    return vec;
}

double sigmoid(double z){
    return 1 / (exp(-z) + 1);
}

rowvec sigmoid(rowvec z){
    for(int i=0; i<z.size(); i++){
        z(i) = sigmoid(z(i));
    }
    return z;
}

colvec sigmoid(colvec z){
    rowvec temp = z.t();
    return (sigmoid(temp)).t();
}

double dsigmoid(double z){
    return z * (1 - z);
}

colvec dsigmoid(colvec a){
    colvec one = ones<colvec>(a.size());
    return a % (one - a);
}

rowvec dsigmoid(rowvec a){
    colvec temp = a.t();
    return (dsigmoid(temp)).t();
}

vector<colvec> getA(mat x, vector<mat>& weightsMatrix, int m){

    vector<colvec> a;
    colvec temp1(x.n_rows);
    a.push_back(temp1);
    for(int i=0; i<numHiddenLayers; i++){
        colvec temp(numHiddenLayerNode);
        a.push_back(temp);
    }
    colvec temp2(numOutputNodes);
    a.push_back(temp2);
    colvec one = ones<colvec>(1);
    for(int i = 0; i < a.size(); i++){
        if(i == 0) a[i] = x.col(m);
        else{
            colvec xtemp = a[i - 1];
            xtemp =  join_cols(one, xtemp);
            a[i] = weightsMatrix[i - 1] * xtemp;
            a[i] = sigmoid(a[i]);
        }
    }    
    return a;    
}

//h(xi) is just last vector of a
colvec gethm(vector<colvec> a){
    return a[a.size() - 1];
}

colvec getCostFunction(mat x, vector<mat>& weightsMatrix, mat y, double lambda){

    int nsamples = x.n_cols;
    colvec sum = zeros<colvec>(y.n_rows);
    for(int m = 0; m < nsamples; m++){
        vector<colvec> a = getA(x, weightsMatrix, m);
        colvec hx = gethm(a);
        colvec one = ones<colvec>(hx.size());
        for(int k = 0; k < y.n_rows; k++){
            if(y(k, m) == 0.0){
                sum = sum + log(one - hx);
            }else{
                sum = sum + log(hx);
            }
        }
    }
    sum = sum / (double)(- nsamples);
    double temp = lambda / 2 / nsamples;
    double sum2 = 0.0;
    for(int i = 0; i < weightsMatrix.size() - 1; i++){
        for(int j = 0; j < weightsMatrix[i].n_cols; j++){
            for(int k = 0; k < weightsMatrix[i].n_rows; k++){
                sum2 += weightsMatrix[i](k, j) * weightsMatrix[i](k, j);
            }
        }
    }
    return sum + temp * sum2;
}

vector<mat> getdJ(mat x, mat y, vector<mat>& weightsMatrix, double lambda){
    //big delta is temp variables for calculating dJ
    //let every variables in bigDelta to be zero.
    vector<mat> bigDelta;
    for(int i=0; i<weightsMatrix.size(); i++){
        mat temp = zeros<mat>(weightsMatrix[i].n_rows, weightsMatrix[i].n_cols);
        bigDelta.push_back(temp);
    }   
    vector<mat> dJ;
    for(int i=0; i<weightsMatrix.size(); i++){
        mat temp = zeros<mat>(weightsMatrix[i].n_rows, weightsMatrix[i].n_cols);
        dJ.push_back(temp);
    }
    int nsamples = x.n_cols;
    //use backProp method
    for(int m = 0; m < nsamples; m++){

        vector<colvec> a = getA(x, weightsMatrix, m);
        vector<colvec> tempDelta;
        for(int i=0; i<a.size(); i++){
            colvec temp = zeros<colvec>(a[i].size());
            tempDelta.push_back(temp);
        }
        //no tempDelta[0]
        for(int l = tempDelta.size() - 1; l > 0; l --){
            if(l == tempDelta.size() - 1){
                tempDelta[l] = a[l] - y.col(m);
            }else{
                mat mult = (weightsMatrix[l]).t() * tempDelta[l + 1];
                tempDelta[l] = mult.rows(1, mult.n_rows - 1) % dsigmoid(a[l]);
            }
        }
        for(int l = 0; l < bigDelta.size(); l++){
            colvec tp = ones<colvec>(1);
            tp =  join_cols(tp, a[l]);
            bigDelta[l] += tempDelta[l + 1] * tp.t();    
        }
    }
    for(int l = 0; l < bigDelta.size(); l++){
        dJ[l] = bigDelta[l] / (double)nsamples;
        dJ[l] = dJ[l] + lambda * weightsMatrix[l];
        for(int j = 0; j < dJ[l].n_rows; j++){
            dJ[l](j, 0) = dJ[l](j, 0) - lambda * weightsMatrix[l](j, 0);
        }            
    }
    return dJ;
}

colvec calculateY(colvec x, vector<mat> weightsMatrix){
    colvec result(x);
    colvec tp = ones<colvec>(1);
    for(int i=0; i<weightsMatrix.size(); i++){
        result = join_cols(tp, result);
        result = weightsMatrix[i] * result;
    }
    return result;
}

void bpnn(vector<vector<double> >&vecX, vector<vector<double> >&vecY, vector<vector<double> >& testX, vector<vector<double> >& testY){

    int nsamples = vecX.size();
    int nfeatures = vecX[0].size();
    //change vecX and vecY into matrix or vector.
    mat y = vec2mat(vecY);
    mat x = vec2mat(vecX);

    numLayers = numHiddenLayers + 1;
    numHiddenLayerNode = nfeatures * 5;
    numOutputNodes = vecY[0].size();
    //build weights matrices and randomly initialize them.
    vector<mat> weightsMatrix;
    mat tempmat;
    double init_epsilon = 0.12;
    //input --> first hidden layer:
    tempmat = randu<mat>(numHiddenLayerNode, nfeatures + 1);
    weightsMatrix.push_back(tempmat);
    //hidden layer --> hidden layer :
    for(int i=0; i< numHiddenLayers - 1; i++){
        tempmat = randu<mat>(numHiddenLayerNode, numHiddenLayerNode + 1);
        weightsMatrix.push_back(tempmat);
    }
    //last hidden layer --> output layer:
    tempmat = randu<mat>(numOutputNodes, numHiddenLayerNode + 1);
    weightsMatrix.push_back(tempmat);
    for(int i=0; i<weightsMatrix.size(); i++){
        weightsMatrix[i] = weightsMatrix[i] * (2 * init_epsilon) - init_epsilon;
    }
    //till now, we finished building weights matrices.

 /*
    //Gradient Checking (this checking gives right answers)
    vector<mat> dJ = getdJ(x, y, weightsMatrix, lambda);
    cout<<"test!!!!"<<endl;
    double step = 1e-4;
    for(int i=0; i<weightsMatrix.size(); i++){
        cout<<"################ Weight Layer "<<i<<endl;
        for(int j=0; j<weightsMatrix[i].n_rows; j++){
            for(int k=0; k<weightsMatrix[i].n_cols; k++){
                double memo = weightsMatrix[i](j, k);
                weightsMatrix[i](j, k) = memo + step;
                colvec value1 = getCostFunction(x, weightsMatrix, y, 0);
                weightsMatrix[i](j, k) = memo - step;
                colvec value2 = getCostFunction(x, weightsMatrix, y, 0);
                colvec tp = (value1 - value2) / (2 * step);
                cout<<tp(0)<<", "<<dJ[i](j, k)<<endl;
                weightsMatrix[i](j, k) = memo;
            }
        }
    }
    */

    int converge = 0;
    while(converge < MAX_ITER){
        vector<mat> dJ = getdJ(x, y, weightsMatrix, lambda);
        for(int j = 0; j < weightsMatrix.size(); j++){
            weightsMatrix[j] -= lrate * dJ[j];
        }
        colvec cost = getCostFunction(x, weightsMatrix, y, lambda);
        double costdouble = cost(0);
        cout<<"learning step: "<<converge<<", Cost function value = "<<costdouble<<endl;
        ++ converge;
    }
    cout<<"############result#############"<<endl;
    for(int i=0; i<testX.size(); i++){
        colvec tpcol = vec2colvec(testX[i]);
        colvec result = calculateY(tpcol, weightsMatrix);
        result = sigmoid(result);
        cout<<result<<endl;
    }

}

我尝试了Gradient检查方法，这给出了正确的答案，所以我认为J（theta）部分是正确的。

因为我使用1 /（exp（-z）+1）Sigmoid函数，所以输出应该是> = 0.5值或<0.5值，但是现在值都大于0.5，我真的困惑。

（很明显，犰狳中的'％'表示'。*'，而我使用的其他犰狳函数非常清晰，与matlab或八度音阶相似）

感谢。

用C ++实现的反向传播神经网络

0 个答案: