I have recently been implementing a neural network based on http://neuralnetworksanddeeplearning.com/. I implemented the whole backpropagation and SGD algorithm in almost the same way as the author of the book. The problem is that with the same hyperparameters he gets around 90% accuracy after one epoch, while I get about 30% accuracy even after 5 epochs. Do you know what could be the cause?

Here is my repository: https://github.com/PiPower/Deep-Neural-Network

This is the part of the backpropagation and SGD algorithm implemented in Network.cpp:
void Network::Train(MatrixD_Array& TrainingData, MatrixD_Array& TrainingLabels, int BatchSize, int epochs, double LearningRate)
{
    assert(TrainingData.size() == TrainingLabels.size() && CostFunc != nullptr && CostFuncDer != nullptr && LearningRate > 0);

    std::vector<long unsigned int> indexes;
    for (int i = 0; i < TrainingData.size(); i++) indexes.push_back(i);

    std::random_device rd;
    std::mt19937 g(rd());

    std::vector<Matrix<double>> NablaWeights;
    std::vector<Matrix<double>> NablaBiases;
    NablaWeights.resize(Layers.size());
    NablaBiases.resize(Layers.size());

    for (int i = 0; i < Layers.size(); i++)
    {
        NablaWeights[i] = Matrix<double>(Layers[i].GetInDim(), Layers[i].GetOutDim());
        NablaBiases[i] = Matrix<double>(1, Layers[i].GetOutDim());
    }

    //---- Epoch iterating
    for (int i = 0; i < epochs; i++)
    {
        cout << "Epoch number: " << i << endl;
        shuffle(indexes.begin(), indexes.end(), g);

        // Batch iterating
        for (int batch = 0; batch < TrainingData.size(); batch = batch + BatchSize)
        {
            for (int i = 0; i < Layers.size(); i++)
            {
                NablaWeights[i].Clear();
                NablaBiases[i].Clear();
            }

            int i = 0;
            while (i < BatchSize && (i + batch) < TrainingData.size())
            {
                std::vector<Matrix<double>> ActivationOutput;
                std::vector<Matrix<double>> Z_Output;
                ActivationOutput.resize(Layers.size() + 1);
                Z_Output.resize(Layers.size());

                ActivationOutput[0] = TrainingData[indexes[i + batch]];
                int index = 0;

                // Pushing values through
                for (auto layer : Layers)
                {
                    Z_Output[index] = layer.Mul(ActivationOutput[index]);
                    ActivationOutput[index + 1] = layer.ApplyActivation(Z_Output[index]);
                    index++;
                }

                // ---- Calculating Nabla that will later be divided by batch size element-wise
                auto DeltaNabla = BackPropagation(ActivationOutput, Z_Output, TrainingLabels[indexes[i + batch]]);

                for (int i = 0; i < Layers.size(); i++)
                {
                    NablaWeights[i] = NablaWeights[i] + DeltaNabla.first[i];
                    NablaBiases[i] = NablaBiases[i] + DeltaNabla.second[i];
                }

                i++;
            }

            for (int g = 0; g < Layers.size(); g++)
            {
                Layers[g].Weights = Layers[g].Weights - NablaWeights[g] * LearningRate;
                Layers[g].Biases = Layers[g].Biases - NablaBiases[g] * LearningRate;
            }

            // std::transform(NablaWeights.begin(), NablaWeights.end(), NablaWeights.begin(), [BatchSize, LearningRate](Matrix<double>& Weight) { return Weight * (LearningRate / BatchSize); });
            // std::transform(NablaBiases.begin(), NablaBiases.end(), NablaBiases.begin(), [BatchSize, LearningRate](Matrix<double>& Bias) { return Bias * (LearningRate / BatchSize); });
        }
    }
}
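The two commented-out std::transform calls at the end of the batch loop are where the accumulated gradients were meant to be scaled by LearningRate / BatchSize, as in the book's mini-batch update w → w − (η/m)·Σ∇w. Purely as an illustration (reusing the same Matrix<double> scalar operators already used above), folding that scaling directly into the update would look roughly like this:

    double scale = LearningRate / static_cast<double>(BatchSize);
    for (int g = 0; g < Layers.size(); g++)
    {
        // Average the accumulated per-sample gradients over the batch
        // before subtracting them, instead of applying the raw sums.
        Layers[g].Weights = Layers[g].Weights - NablaWeights[g] * scale;
        Layers[g].Biases = Layers[g].Biases - NablaBiases[g] * scale;
    }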
std::pair<MatrixD_Array, MatrixD_Array> Network::BackPropagation(MatrixD_Array& ActivationOutput, MatrixD_Array& Z_Output, Matrix<double>& label)
{
    MatrixD_Array NablaWeight;
    MatrixD_Array NablaBias;
    NablaWeight.resize(Layers.size());
    NablaBias.resize(Layers.size());

    auto zs = Layers[Layers.size() - 1].ActivationPrime(Z_Output[Z_Output.size() - 1]);
    Matrix<double> Delta_L = Hadamard(CostFuncDer(ActivationOutput[ActivationOutput.size() - 1], label), zs);
    NablaWeight[Layers.size() - 1] = Delta_L * ActivationOutput[ActivationOutput.size() - 2].Transpose();
    NablaBias[Layers.size() - 1] = Delta_L;

    for (int j = 2; j <= Layers.size(); j++)
    {
        auto sp = Layers[Layers.size() - j].ActivationPrime(Z_Output[Layers.size() - j]);
        Delta_L = Hadamard(Layers[Layers.size() - j + 1].Weights.Transpose() * Delta_L, sp);
        NablaWeight[Layers.size() - j] = Delta_L * ActivationOutput[ActivationOutput.size() - j - 1].Transpose();
        NablaBias[Layers.size() - j] = Delta_L;
    }

    return make_pair(NablaWeight, NablaBias);
}
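The loop above is meant to implement the book's backpropagation equations: δ_L = ∇_a C ⊙ σ′(z_L) for the output layer, δ_l = (W_{l+1}ᵀ · δ_{l+1}) ⊙ σ′(z_l) for the hidden layers, with ∂C/∂W_l = δ_l · a_{l−1}ᵀ and ∂C/∂b_l = δ_l.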
Answer 0 (score 0):
It turned out that the MNIST loader was not working correctly.
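A quick way to catch this kind of problem early is to verify the file headers while loading. A minimal, generic sketch of such a check (it assumes the standard MNIST IDX file layout and is not taken from the repository's loader) could look like this:

    #include <cstdint>
    #include <fstream>
    #include <iostream>
    #include <string>

    // Read a 32-bit big-endian integer, as used in the MNIST IDX headers.
    static uint32_t ReadBigEndian32(std::ifstream& in)
    {
        unsigned char b[4];
        in.read(reinterpret_cast<char*>(b), 4);
        return (uint32_t(b[0]) << 24) | (uint32_t(b[1]) << 16) | (uint32_t(b[2]) << 8) | uint32_t(b[3]);
    }

    // Print the header of an MNIST image file so it can be compared against
    // the expected values (for train-images-idx3-ubyte: 2051, 60000, 28, 28).
    void CheckMnistImages(const std::string& path)
    {
        std::ifstream in(path, std::ios::binary);
        uint32_t magic = ReadBigEndian32(in);
        uint32_t count = ReadBigEndian32(in);
        uint32_t rows  = ReadBigEndian32(in);
        uint32_t cols  = ReadBigEndian32(in);
        std::cout << "magic=" << magic << " images=" << count
                  << " size=" << rows << "x" << cols << std::endl;
    }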