I have written some fairly standard CNTK code to train a simple feed-forward network using the C++ API. Training on the GPU is roughly 4x faster than on the CPU (where the CPU sits at 100% usage), yet during GPU training the reported GPU utilization is close to zero. How can I increase GPU utilization with the CNTK C++ API? My results are good, but training seems slower than it should be.
I am running CNTK on a recent Lenovo machine with Windows 10, an Intel CPU, and a Quadro P3200. I apply CNTK to in-memory data generated on the fly by my program: a simple data set with 2 scalar features and 1 scalar label per item (around 10,000 items), produced by a Monte Carlo simulation. Since there is no C++ reader API for in-memory data, I use the Value::CreateSequence API to create the data passed to the trainer. Note that the data is created directly on the card and marked read-only (the "device" argument being GPU(0)), to reduce host/card transfers. The data originally lives in C++ Eigen matrices. The code sample below uses a Nesterov learner, but I have also tried plain SGD with similar results. All computation uses float, which should be fast on the Quadro.
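To make that data-creation step concrete in isolation, this is roughly the call I am relying on (MakeGpuSequence is just an illustrative helper name, not part of my actual class; my real code does this inline). The full training routine follows:

// Illustrative only: the isolated data-creation step described above.
#include "CNTKLibrary.h"
#include <vector>
using namespace CNTK;

ValuePtr MakeGpuSequence(const NDShape& sampleShape, const std::vector<float>& flatData)
{
    // readOnly = true and device = GPU(0), so the Value is created on the card
    // and no further host/card transfer should be needed during training.
    return Value::CreateSequence(sampleShape, flatData, DeviceDescriptor::GPUDevice(0), /*readOnly =*/ true);
}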
void RMLearning::OLSDeepLearning(const MatrixXd& X, const MatrixXd& Y, const DeviceDescriptor& device)
{
    // network input, model, and squared-error loss
    auto inputVarName = L"features";
    auto inputVar = InputVariable({ m_inputDim }, DataType::Float, inputVarName);
    auto regressionOutput = FullyConnectedFeedForwardRegressionNet(inputVar, m_outputDim, m_hiddenLayersDim, m_numHiddenLayers, device, m_nonLinearity, L"regressionOutput");
    auto labelsVarName = L"Labels";
    auto labelsVar = InputVariable({ m_outputDim }, DataType::Float, labelsVarName);
    auto trainingLoss = ReduceSum(CNTK::SquaredError(regressionOutput, labelsVar, L"SquaredErrorLossFunction"), Axis::AllAxes(), L"SquaredErrorLossFunction");
    if (m_SaveAndReLoadModel)
        SaveReloadESModel(regressionOutput, trainingLoss, inputVar, labelsVar, device, inputVarName, labelsVarName);
    m_prediction = regressionOutput;

    ProgressWriterPtr pw = MakeSharedObject<MyProgressWriter>(0, 0, 0, 0, 0, 0);
    // Nesterov learner (SGD with momentum)
    m_learner = MomentumSGDLearner(regressionOutput->Parameters(), m_learningRate, m_Momentum, true);
    m_trainer = CreateTrainer(regressionOutput, trainingLoss, m_prediction, { m_learner }, { pw });

    // flatten the Eigen matrices into float vectors, one sample after another
    size_t numSamples = X.rows();
    m_inputData.resize(m_inputDim * numSamples);
    m_labelData.resize(m_outputDim * numSamples, 0);
    m_olsFittedValues.resize(numSamples);
    flattenXYData(numSamples, X, Y);

    // create the full-batch Values directly on the target device, read-only
    NDShape inputShape({ m_inputDim });
    auto dim = inputShape.Dimensions();
    ValuePtr inputValue = Value::CreateSequence(inputShape, m_inputData, device, true);
    NDShape labelShape({ m_outputDim });
    ValuePtr labelValue = Value::CreateSequence(labelShape, m_labelData, device, true);
    ValuePtr outputValue, predictionErrorValue;
    // main training loop
    // At the moment the CNTK C++ API lacks "reader" functions for taking data from memory;
    // the readers are what provide automatic minibatch extraction from a full batch.
    // Here we therefore train on the full batch each iteration (a sketch of manual
    // minibatching follows after this function).
    if (m_printTrainingProgress) std::cout << "OLS learning..." << std::endl;
    for (size_t i = 0; i < m_iterationCount; ++i)
    {
        m_trainer->TrainMinibatch({ { inputVar, inputValue }, { labelsVar, labelValue } }, device);
        if (m_printTrainingProgress) m_trainer->SummarizeTrainingProgress();
    }
    // record fitted training weights
    getTrainingWeightsVaR();
    // get fitted values and VaR weights
    EvaluationSequenceUsingDense(m_trainer->EvaluationFunction(), m_inputData, m_olsFittedValues, false, device);
    if (m_printTrainingSummary)
    {
        std::unordered_map<Variable, ValuePtr> tmap = { { inputVar, inputValue } };
        m_trainer->TestMinibatch(tmap, device, false);
        m_trainer->SummarizeTestProgress();
    }
}
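For reference, this is roughly what manual minibatching from the same flattened vectors would look like, since the readers normally handle that. It is a sketch only: it reuses the local variables from OLSDeepLearning above, the minibatch size is an arbitrary illustrative value, and it is not the code the timings below were measured with.

// Sketch only: manually slicing the flattened host vectors into minibatches,
// in the absence of an in-memory reader. Names mirror the function above.
const size_t minibatchSize = 1024; // illustrative value
for (size_t iter = 0; iter < m_iterationCount; ++iter)
{
    for (size_t start = 0; start < numSamples; start += minibatchSize)
    {
        size_t count = std::min(minibatchSize, numSamples - start);
        // copy one minibatch worth of samples out of the flat vectors
        std::vector<float> mbInput(m_inputData.begin() + start * m_inputDim,
                                   m_inputData.begin() + (start + count) * m_inputDim);
        std::vector<float> mbLabel(m_labelData.begin() + start * m_outputDim,
                                   m_labelData.begin() + (start + count) * m_outputDim);
        ValuePtr mbInputValue = Value::CreateSequence(inputShape, mbInput, device, true);
        ValuePtr mbLabelValue = Value::CreateSequence(labelShape, mbLabel, device, true);
        m_trainer->TrainMinibatch({ { inputVar, mbInputValue }, { labelsVar, mbLabelValue } }, device);
    }
}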
void RMLearning::flattenXYData(const size_t& numSamples, const Eigen::MatrixXd& X, const Eigen::MatrixXd& Y)
{
    // map the X matrix to the flat input data vector (sample by sample)
    size_t j = 0;
    for (size_t i1 = 0; i1 < numSamples; ++i1)
    {
        for (size_t i2 = 0; i2 < m_inputDim; ++i2)
        {
            m_inputData[j] = (float)X(i1, i2);
            j++;
        }
    }
    // map the Y matrix to the flat label data vector
    j = 0;
    for (size_t i1 = 0; i1 < numSamples; ++i1)
    {
        for (size_t i2 = 0; i2 < m_outputDim; ++i2)
        {
            m_labelData[j] = (float)Y(i1, i2);
            j++;
        }
    }
}
// Neural network structure setup
inline FunctionPtr RMLearning::FullyConnectedFeedForwardRegressionNet(Variable input,
    size_t outputLayerDim,
    size_t hiddenLayerDim,
    size_t numHiddenLayers,
    const DeviceDescriptor& device,
    const std::function<FunctionPtr(const FunctionPtr&)>& nonLinearity,
    const std::wstring& outputName,
    unsigned long seed)
{
    assert(numHiddenLayers >= 1);
    // first hidden layer takes the raw input; the remaining hidden layers stack on top of it
    auto regressionRoot = FullyConnectedDNNLayer(input, hiddenLayerDim, device, nonLinearity, L"", seed);
    for (size_t i = 1; i < numHiddenLayers; ++i)
        regressionRoot = FullyConnectedDNNLayer(regressionRoot, hiddenLayerDim, device, nonLinearity, L"", seed);
    // linear output layer for regression
    regressionRoot = FullyConnectedLinearLayer(regressionRoot, outputLayerDim, device, outputName, seed);
    return regressionRoot;
}
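For completeness, this is roughly how the routine is driven (a hypothetical driver: the RMLearning configuration of dimensions, learning rate and momentum is omitted, and the random matrices stand in for the Monte Carlo output). The only difference between the GPU and CPU runs quoted below is the device argument.

// Illustrative driver only. X holds 2 scalar features per row, Y 1 scalar label per row;
// Random() is a placeholder for the Monte Carlo data.
Eigen::MatrixXd X = Eigen::MatrixXd::Random(20000, 2);
Eigen::MatrixXd Y = Eigen::MatrixXd::Random(20000, 1);
RMLearning learning; // member configuration (dims, learning rate, momentum) omitted
learning.OLSDeepLearning(X, Y, DeviceDescriptor::GPUDevice(0));   // ~2 s for 1,000 iterations, GPU utilization near zero
// learning.OLSDeepLearning(X, Y, DeviceDescriptor::CPUDevice()); // ~8 s, all CPU cores at 100%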
With a network of 3 hidden layers of 20 units each, training the full batch of 20,000 records for 1,000 iterations takes about 2 seconds on the GPU (with utilization near zero) versus about 8 seconds on the CPU (6-core Intel at 100% usage). Given that CNTK shields the developer from most of the low-level CUDA details, does it expose any API or parameters that can be used to raise utilization? Could the GPU simply be waiting on the CPU in my case? What strategies have others found useful?