I am implementing a BB-RBM from scratch so that I can understand it completely. I have everything working, but I am struggling with the switch from Contrastive Divergence (CD) to Persistent Contrastive Divergence (PCD).
I understand that in PCD the Gibbs sampling starts from the previous state of the chain. What I am confused about is how to implement that concretely in the training loop. What I have right now in C++ is:
// train for number of epochs
for (int epoch = 1; epoch <= nEpochs; epoch++) {
    // randomize the rows
    vector<int> rowNumbers;
    for (int rowNumber = 0; rowNumber < trainData->nRows; rowNumber++)
        rowNumbers.push_back(rowNumber);
    for (int i = 0; i < trainData->nRows; i++) {
        int index = rand() % rowNumbers.size();
        randomRows[i] = rowNumbers.at(index);
        rowNumbers.erase(rowNumbers.begin() + index);
    }
    int randRowIndex = 0;
    // loop through all batches
    cout << "batch loop ";
    for (int batchLoop = 1; batchLoop <= nBatchLoops; batchLoop++) {
        cout << batchLoop << " ";
        // Gibbs sample for a random batch of training vectors
        for (int batchNumber = 1; batchNumber <= batchSize; batchNumber++) {
            // get a random row from the training data matrix (corresponding to a random training vector)
            int row = randomRows[randRowIndex++];
            // input data into the visible layer
            for (int i = 0; i < nVis; i++)
                vis[i] = trainData->data[row * trainData->nCols + i];
            // do one Monte Carlo sampling of hidden layer
            for (int j = 0; j < nHid; j++) {
                // sum a response from bias plus weighted inputs
                double sum = hidBias[j];
                for (int i = 0; i < nVis; i++)
                    sum += weights[j * nVis + i] * vis[i];
                // input sum into the sigmoid function, to get the probability of turning this hidden node on
                double prob = 1.0 / (1.0 + exp(-1.0 * sum));
                // get a uniformly random number between [0,1]
                double ran = ((double)rand() / (RAND_MAX));
                // turn this node on or off, based on random number and probability
                if (prob >= ran)
                    hid[j] = 1.0;
                else
                    hid[j] = 0.0;
                // save probability of turning this hidden node on
                hid0Probs[j] = prob;
            }
            // now reconstruct visible layer and sample another stochastic hidden layer state for a given number of Gibbs sampling iterations
            for (int gibbs = 1; gibbs <= nGibbs; gibbs++) {
                // if using PCD, then input the previous chain state here
                if (PCD && gibbs == 1) {
                    for (int i = 0; i < nVis; i++)
                        vis[i] = chains[row * trainData->nCols + i];
                }
                // otherwise if we are using CD, do one Monte Carlo to reconstruct visible layer
                else {
                    for (int i = 0; i < nVis; i++) {
                        // sum a response from bias plus weighted inputs
                        double sum = visBias[i];
                        for (int j = 0; j < nHid; j++)
                            sum += weights[j * nVis + i] * hid[j];
                        // input sum into the sigmoid function, to get the probability of turning this visible node on
                        double prob = 1.0 / (1.0 + exp(-1.0 * sum));
                        // get a uniformly random number between [0,1]
                        double ran = ((double)rand() / (RAND_MAX));
                        // turn this node on or off, based on random number and probability
                        if (prob >= ran)
                            vis[i] = 1.0;
                        else
                            vis[i] = 0.0;
                        // save probability of turning the visible node on during reconstruction
                        if (gibbs == nGibbs)
                            visFProbs[i] = prob;
                        // if using PCD, save the value in the chain
                        if (PCD && gibbs == nGibbs)
                            chains[row * trainData->nCols + i] = vis[i];
                    }
                }
                // do one Monte Carlo sampling of hidden layer
                for (int j = 0; j < nHid; j++) {
                    // sum a response from bias plus weighted inputs
                    double sum = hidBias[j];
                    for (int i = 0; i < nVis; i++)
                        sum += weights[j * nVis + i] * vis[i];
                    // input sum into the sigmoid function, to get the probability of turning this hidden node on
                    double prob = 1.0 / (1.0 + exp(-1.0 * sum));
                    // get a uniformly random number between [0,1]
                    double ran = ((double)rand() / (RAND_MAX));
                    // turn this node on or off, based on random number and probability
                    if (prob >= ran)
                        hid[j] = 1.0;
                    else
                        hid[j] = 0.0;
                    // save probability of turning this hidden node on
                    if (gibbs == nGibbs)
                        hidFProbs[j] = prob;
                }
            }
            // calculate partial derivatives using Contrastive Divergence, comparing the input and initial hidden state to the final reconstruction and hidden state
            for (int i = 0; i < nVis; i++) {
                // there is a lot of debate about whether you should use the binary state values or the probabilities of the hidden layer;
                // both work, but I used the probabilities to reduce the effect of the random on/off states
                // add the partial derivative of the energy term with respect to the visible bias term
                gVisBias[i] += (trainData->data[row * trainData->nCols + i]) - (vis[i]); // <>data - <>model
                for (int j = 0; j < nHid; j++) {
                    // add the partial derivative of the energy term with respect to the weight
                    gWeights[j * nVis + i] += (hid0Probs[j] * trainData->data[row * trainData->nCols + i]) - (hidFProbs[j] * vis[i]); // <>data - <>model
                }
            }
            for (int j = 0; j < nHid; j++)
                // add the partial derivative of the energy term with respect to the hidden bias term
                gHidBias[j] += (hid0Probs[j]) - (hidFProbs[j]); // <>data - <>model
            // calculate training reconstruction error; to be more accurate, the reconstruction error is usually calculated on the same test data every time, but what I did here is quicker and dirtier
            for (int i = 0; i < nVis; i++)
                err += pow(vis[i] - trainData->data[row * trainData->nCols + i], 2);
            // grab another random input vector for this batch, and do another batch iteration...
        }
        // only update the weights and bias terms if batchSize vectors were used in this batch; if your batches are even this will not be a problem
        if (!unevenBatches || (unevenBatches && batchLoop != nBatchLoops)) {
            // now that Gibbs sampling is done for our batch of training vectors, we need to update weights...
            for (int i = 0; i < nVis; i++) {
                // calculate the change in visible bias term
                dVisBias[i] *= learningMomentum;
                dVisBias[i] += learningRate * gVisBias[i] / ((double)batchSize);
                dVisBias[i] -= learningRate * L1 * (visBias[i] == 0 ? 0.0 : (visBias[i] > 0 ? 1.0 : -1.0));
                dVisBias[i] -= learningRate * L2 * visBias[i];
                // update visible bias term
                visBias[i] += dVisBias[i];
                for (int j = 0; j < nHid; j++) {
                    // calculate the change in weight
                    dWeights[j * nVis + i] *= learningMomentum;
                    dWeights[j * nVis + i] += learningRate * gWeights[j * nVis + i] / ((double)batchSize);
                    dWeights[j * nVis + i] -= learningRate * L1 * (weights[j * nVis + i] == 0 ? 0.0 : (weights[j * nVis + i] > 0 ? 1.0 : -1.0));
                    dWeights[j * nVis + i] -= learningRate * L2 * weights[j * nVis + i];
                    // update weight
                    weights[j * nVis + i] += dWeights[j * nVis + i];
                }
            }
            for (int j = 0; j < nHid; j++) {
                // calculate the change in hidden bias term
                dHidBias[j] *= learningMomentum;
                dHidBias[j] += learningRate * gHidBias[j] / ((double)batchSize);
                dHidBias[j] -= learningRate * L1 * (hidBias[j] == 0 ? 0.0 : (hidBias[j] > 0 ? 1.0 : -1.0));
                dHidBias[j] -= learningRate * L2 * hidBias[j];
                // update hidden bias term
                hidBias[j] += dHidBias[j];
            }
        }
        // reset weights and bias term gradients
        for (int i = 0; i < nVis; i++) {
            gVisBias[i] = 0.0;
            for (int j = 0; j < nHid; j++)
                gWeights[j * nVis + i] = 0.0;
        }
        for (int j = 0; j < nHid; j++)
            gHidBias[j] = 0.0;
        // now grab next batch for this epoch...
    }
    // output time to finish this epoch and training reconstruction error
    cout << endl << "epoch #" << epoch << " finished in " << stopWatch.lap() << " milliseconds with a training reconstruction error of " << err << endl;
    // reset vars
    err = 0.0;
    // now go to next epoch, repeating the training process...
}
This is where I doubt whether I have implemented the PCD step correctly. The original paper (https://www.cs.toronto.edu/~tijmen/pcd/pcd.pdf) is not explicit about how to implement it, and I have not been able to work it out from the articles, examples, and videos I have come across. My implementation makes sense to me because it keeps the starting point of the MCMC chain (the original data point), adapts to the new parameters by recomputing the positive-phase hidden activations, and lets the negative-phase Gibbs sampling wander further from the data than normal CD by starting it at the previous chain state. That said, my implementation could be completely wrong, since I am learning this on my own without a teacher or peers.
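To make my confusion concrete, here is a minimal sketch of the other way I could read the paper: the persistent chain lives outside the data loop, is initialized once (for example randomly or from the first batch), and is never reset to a training vector; only the positive phase is clamped to the data. The helper functions and variable names below are hypothetical placeholders, not code from my program:

// sketch of the alternative PCD reading (helper names are made up)
for (int update = 1; update <= nUpdates; update++) {
    // positive phase: clamp a training vector and compute hidden probabilities
    // with the CURRENT parameters; this gives the <v_i h_j>_data term
    loadVisibleFromData(vis, trainData, row);
    sampleHiddenGivenVisible(hid, hid0Probs, vis);

    // negative phase: continue the persistent chain from wherever it stopped
    // at the last update; it is never re-initialized from the data
    for (int k = 1; k <= nGibbs; k++) {
        sampleHiddenGivenVisible(chainHid, chainHidProbs, chainVis);
        sampleVisibleGivenHidden(chainVis, chainHid);
    }
    computeHiddenProbs(chainHidProbs, chainVis);   // gives the <v_i h_j>_model term

    // gradient: <v_i h_j>_data - <v_i h_j>_model, then the parameter update;
    // chainVis simply stays in memory and is the starting point next time
    accumulateGradientsAndUpdate(vis, hid0Probs, chainVis, chainHidProbs);
}

Part of my question is whether this reading, or the one in my code above (where the chain is tied to each data row and restarted from it), is what the paper actually intends.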
I also have doubts about my implementation for two other reasons:
1) The mini-batching: it might make more sense to pull the epoch loop inside the batch loop, so that the parameters are fully updated for nEpochs on one batch before moving on to the next batch (roughly as sketched below this list).
2) The computation of hid0Probs[j]: right now I recompute it every epoch so that it accounts for the updated parameters, but I can also see merit in two alternatives: a) keeping hid0Probs[j] at the values it had during the first epoch, so that the positive phase is identical in every training epoch (but then it does not account for the adjusted parameters); or b) using the first hidden-node values from the previous chain (but then it no longer represents the positive phase that started the chain).
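To be explicit about what I mean in point 1, this is roughly the loop order I am considering. trainOnBatch() is just a made-up helper standing in for the whole per-batch body from my code above (positive phase, negative phase, gradient accumulation, parameter update):

// hypothetical sketch of option 1: fully update on one batch for nEpochs, then move on
for (int batchLoop = 1; batchLoop <= nBatchLoops; batchLoop++) {
    for (int epoch = 1; epoch <= nEpochs; epoch++) {
        trainOnBatch(batchLoop);
    }
}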
Can anyone shed some light on this?