I am implementing a BB-RBM from scratch so that I can understand it completely. I have everything working, but I am struggling with the switch from Contrastive Divergence (CD) to Persistent Contrastive Divergence (PCD).
I understand that in PCD the Gibbs sampling starts from the previous state of the chain. What I am confused about is how to implement that concretely in the training loop. What I have right now in C++ is:
// train for number of epochs
for (int epoch = 1; epoch <= nEpochs; epoch++) {
    // randomize the rows
    vector<int> rowNumbers;
    for (int rowNumber = 0; rowNumber < trainData->nRows; rowNumber++)
        rowNumbers.push_back(rowNumber);
    for (int i = 0; i < trainData->nRows; i++) {
        int index = rand() % rowNumbers.size();
        randomRows[i] = rowNumbers.at(index);
        rowNumbers.erase(rowNumbers.begin() + index);
    }
    int randRowIndex = 0;
    // loop through all batches
    cout << "batch loop ";
    for (int batchLoop = 1; batchLoop <= nBatchLoops; batchLoop++) {
        cout << batchLoop << " ";
        // Gibbs sample for a random batch of training vectors
        for (int batchNumber = 1; batchNumber <= batchSize; batchNumber++) {
            // get a random row from the training data matrix (corresponding to a random training vector)
            int row = randomRows[randRowIndex++];
            // input data into the visible layer
            for (int i = 0; i < nVis; i++)
                vis[i] = trainData->data[row * trainData->nCols + i];
            // do one Monte Carlo sampling of hidden layer
            for (int j = 0; j < nHid; j++) {
                // sum a response from bias plus weighted inputs
                double sum = hidBias[j];
                for (int i = 0; i < nVis; i++)
                    sum += weights[j * nVis + i] * vis[i];
                // input sum into the sigmoid function, to get the probability of turning this hidden node on
                double prob = 1.0 / (1.0 + exp(-1.0 * sum));
                // get a uniformly random number between [0,1]
                double ran = ((double)rand() / (RAND_MAX));
                // turn this node on or off, based on random number and probability
                if (prob >= ran)
                    hid[j] = 1.0;
                else
                    hid[j] = 0.0;
                // save probability of turning this hidden node on
                hid0Probs[j] = prob;
            }
            // now reconstruct visible layer and sample another stochastic hidden layer state for a given number of Gibbs sampling iterations
            for (int gibbs = 1; gibbs <= nGibbs; gibbs++) {
                // if using PCD, then input the previous chain state here
                if (PCD && gibbs == 1) {
                    for (int i = 0; i < nVis; i++)
                        vis[i] = chains[row * trainData->nCols + i];
                }
                // otherwise if we are using CD, do one Monte Carlo to reconstruct visible layer
                else {
                    for (int i = 0; i < nVis; i++) {
                        // sum a response from bias plus weighted inputs
                        double sum = visBias[i];
                        for (int j = 0; j < nHid; j++)
                            sum += weights[j * nVis + i] * hid[j];
                        // input sum into the sigmoid function, to get the probability of turning this visible node on
                        double prob = 1.0 / (1.0 + exp(-1.0 * sum));
                        // get a uniformly random number between [0,1]
                        double ran = ((double)rand() / (RAND_MAX));
                        // turn this node on or off, based on random number and probability
                        if (prob >= ran)
                            vis[i] = 1.0;
                        else
                            vis[i] = 0.0;
                        // save probability of turning the visible node on during reconstruction
                        if (gibbs == nGibbs)
                            visFProbs[i] = prob;
                        // if using PCD, save the value in the chain
                        if (PCD && gibbs == nGibbs)
                            chains[row * trainData->nCols + i] = vis[i];
                    }
                }
                // do one Monte Carlo sampling of hidden layer
                for (int j = 0; j < nHid; j++) {
                    // sum a response from bias plus weighted inputs
                    double sum = hidBias[j];
                    for (int i = 0; i < nVis; i++)
                        sum += weights[j * nVis + i] * vis[i];
                    // input sum into the sigmoid function, to get the probability of turning this hidden node on
                    double prob = 1.0 / (1.0 + exp(-1.0 * sum));
                    // get a uniformly random number between [0,1]
                    double ran = ((double)rand() / (RAND_MAX));
                    // turn this node on or off, based on random number and probability
                    if (prob >= ran)
                        hid[j] = 1.0;
                    else
                        hid[j] = 0.0;
                    // save probability of turning this hidden node on
                    if (gibbs == nGibbs)
                        hidFProbs[j] = prob;
                }
            }
            // calculate partial derivatives using Contrastive Divergence, comparing the input and initial hidden state to the final reconstruction and hidden state
            for (int i = 0; i < nVis; i++) {
                // there is a lot of debate about whether you should use the binary state values or the probabilities of the hidden layer;
                // both work, but I used the probabilities to reduce the effect of the random on/off states
                // add the partial derivative of the energy term with respect to the visible bias term
                gVisBias[i] += (trainData->data[row * trainData->nCols + i]) - (vis[i]); // <>data - <>model
                for (int j = 0; j < nHid; j++) {
                    // add the partial derivative of the energy term with respect to the weight
                    gWeights[j * nVis + i] += (hid0Probs[j] * trainData->data[row * trainData->nCols + i]) - (hidFProbs[j] * vis[i]); // <>data - <>model
                }
            }
            for (int j = 0; j < nHid; j++)
                // add the partial derivative of the energy term with respect to the hidden bias term
                gHidBias[j] += (hid0Probs[j]) - (hidFProbs[j]); // <>data - <>model
            // calculate training reconstruction error; to be more accurate, the reconstruction error is usually calculated on the same test data every time, but what I did here is quicker and dirtier
            for (int i = 0; i < nVis; i++)
                err += pow(vis[i] - trainData->data[row * trainData->nCols + i], 2);
            // grab another random input vector for this batch, and do another batch iteration...
        }
        // only update the weights and bias terms if batchSize vectors were used in this batch; if your batches are even this will not be a problem
        if (!unevenBatches || (unevenBatches && batchLoop != nBatchLoops)) {
            // now that Gibbs sampling is done for our batch of training vectors, we need to update weights...
            for (int i = 0; i < nVis; i++) {
                // calculate the change in visible bias term
                dVisBias[i] *= learningMomentum;
                dVisBias[i] += learningRate * gVisBias[i] / ((double)batchSize);
                dVisBias[i] -= learningRate * L1 * (visBias[i] == 0 ? 0.0 : (visBias[i] > 0 ? 1.0 : -1.0));
                dVisBias[i] -= learningRate * L2 * visBias[i];
                // update visible bias term
                visBias[i] += dVisBias[i];
                for (int j = 0; j < nHid; j++) {
                    // calculate the change in weight
                    dWeights[j * nVis + i] *= learningMomentum;
                    dWeights[j * nVis + i] += learningRate * gWeights[j * nVis + i] / ((double)batchSize);
                    dWeights[j * nVis + i] -= learningRate * L1 * (weights[j * nVis + i] == 0 ? 0.0 : (weights[j * nVis + i] > 0 ? 1.0 : -1.0));
                    dWeights[j * nVis + i] -= learningRate * L2 * weights[j * nVis + i];
                    // update weight
                    weights[j * nVis + i] += dWeights[j * nVis + i];
                }
            }
            for (int j = 0; j < nHid; j++) {
                // calculate the change in hidden bias term
                dHidBias[j] *= learningMomentum;
                dHidBias[j] += learningRate * gHidBias[j] / ((double)batchSize);
                dHidBias[j] -= learningRate * L1 * (hidBias[j] == 0 ? 0.0 : (hidBias[j] > 0 ? 1.0 : -1.0));
                dHidBias[j] -= learningRate * L2 * hidBias[j];
                // update hidden bias term
                hidBias[j] += dHidBias[j];
            }
        }
        // reset weights and bias term gradients
        for (int i = 0; i < nVis; i++) {
            gVisBias[i] = 0.0;
            for (int j = 0; j < nHid; j++)
                gWeights[j * nVis + i] = 0.0;
        }
        for (int j = 0; j < nHid; j++)
            gHidBias[j] = 0.0;
        // now grab next batch for this epoch...
    }
    // output time to finish this epoch and training reconstruction error
    cout << endl << "epoch #" << epoch << " finished in " << stopWatch.lap() << " milliseconds with a training reconstruction error of " << err << endl;
    // reset vars
    err = 0.0;
    // now go to next epoch, repeating the training process...
}
This is where I doubt whether I have implemented the PCD step correctly. The original paper (https://www.cs.toronto.edu/~tijmen/pcd/pcd.pdf) is not explicit about how to implement it, and I have not been able to work it out from the articles, examples, and videos I have come across. My implementation makes sense to me because it keeps the starting point of the MCMC chain (the original data point), adapts to the new parameters by recomputing the positive-phase hidden activations, and lets the negative-phase Gibbs sampling wander further from the data than normal CD by starting it at the previous chain state. That said, my implementation could be completely wrong, since I am learning this on my own without a teacher or peers.
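To make my confusion concrete, here is a minimal sketch of the other way I could read the paper: the persistent chain lives outside the data loop, is initialized once (for example randomly or from the first batch), and is never reset to a training vector; only the positive phase is clamped to the data. The helper functions and variable names below are hypothetical placeholders, not code from my program:

// sketch of the alternative PCD reading (helper names are made up)
for (int update = 1; update <= nUpdates; update++) {
    // positive phase: clamp a training vector and compute hidden probabilities
    // with the CURRENT parameters; this gives the <v_i h_j>_data term
    loadVisibleFromData(vis, trainData, row);
    sampleHiddenGivenVisible(hid, hid0Probs, vis);

    // negative phase: continue the persistent chain from wherever it stopped
    // at the last update; it is never re-initialized from the data
    for (int k = 1; k <= nGibbs; k++) {
        sampleHiddenGivenVisible(chainHid, chainHidProbs, chainVis);
        sampleVisibleGivenHidden(chainVis, chainHid);
    }
    computeHiddenProbs(chainHidProbs, chainVis);   // gives the <v_i h_j>_model term

    // gradient: <v_i h_j>_data - <v_i h_j>_model, then the parameter update;
    // chainVis simply stays in memory and is the starting point next time
    accumulateGradientsAndUpdate(vis, hid0Probs, chainVis, chainHidProbs);
}

Part of my question is whether this reading, or the one in my code above (where the chain is tied to each data row and restarted from it), is what the paper actually intends.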
I also have doubts about my implementation for two other reasons:
1) The mini-batching: it might make more sense to pull the epoch loop inside the batch loop, so that the parameters are fully updated for nEpochs on one batch before moving on to the next batch (roughly as sketched below this list).
2) The computation of hid0Probs[j]: right now I recompute it every epoch so that it accounts for the updated parameters, but I can also see merit in two alternatives: a) keeping hid0Probs[j] at the values it had during the first epoch, so that the positive phase is identical in every training epoch (but then it does not account for the adjusted parameters); or b) using the first hidden-node values from the previous chain (but then it no longer represents the positive phase that started the chain).
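To be explicit about what I mean in point 1, this is roughly the loop order I am considering. trainOnBatch() is just a made-up helper standing in for the whole per-batch body from my code above (positive phase, negative phase, gradient accumulation, parameter update):

// hypothetical sketch of option 1: fully update on one batch for nEpochs, then move on
for (int batchLoop = 1; batchLoop <= nBatchLoops; batchLoop++) {
    for (int epoch = 1; epoch <= nEpochs; epoch++) {
        trainOnBatch(batchLoop);
    }
}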
Can anyone shed some light on this?