I have written my own code using TensorFlow's C API to do inference (i.e. run a trained artificial neural network) inside a C++ fluid dynamics simulation program. At some point, however, the computation stops and gives me this error:
mpirun noticed that process rank 10 with PID 0 on node node134 exited on signal 9 (Killed).
I also noticed that this probably happens because no memory is left: at the moment the computation stops, both RAM and swap are completely occupied. Signal 9 (SIGKILL) is what the Linux OOM killer typically sends when a process runs out of memory, which fits that observation. I don't understand why it happens, though: the only thing I changed since the program last ran without errors is the code I added to it.
In the fluid dynamics software, I programmed this:
auto t_start_0 = std::chrono::high_resolution_clock::now();
const char* frozenGraphName = "/home/elias/Lr75-57_FPVANN_premix/data/FPV_ANN_tabulated_Standard_500.pb";
const char* inputOperationName = "input_1";
const char* outputOperationName = "dense_2/BiasAdd";
int no_of_inputs = in_mean.size();
int no_of_outputs = out_mean.size();
int cellsAndPatches = (input_f_zeta_PVNorm.size())/no_of_inputs;
std::vector<int64_t> input_dimensions = {cellsAndPatches,no_of_inputs};
std::vector<int64_t> output_dimensions = {cellsAndPatches,no_of_outputs};
Inference* inf = new Inference(frozenGraphName,inputOperationName,outputOperationName,no_of_inputs,no_of_outputs,input_dimensions,output_dimensions,cellsAndPatches);
output_real = inf->doInference(input_f_zeta_PVNorm);
delete inf;
auto t_end_0 = std::chrono::high_resolution_clock::now();
auto total_0 = std::chrono::duration<float, std::milli>(t_end_0 - t_start_0).count();
std::cout << "TOTAL INFERENCE TIME C API: " << total_0 << std::endl;
The constructor of my Inference class looks like this:
Inference::Inference(const char* fgn, const char* iname, const char* oname, int nIn, int nOut, std::vector<int64_t> dimIn, std::vector<int64_t> dimOut, int CP) : no_input_sizes(nIn), no_output_sizes(nOut), noCellsPatches(CP)
{
    TF_Buffer* graph_def = read_file(fgn);
    graph = TF_NewGraph();
    status = TF_NewStatus();
    TF_ImportGraphDefOptions* graph_opts = TF_NewImportGraphDefOptions();
    TF_GraphImportGraphDef(graph, graph_def, graph_opts, status);
    if(TF_GetCode(status)!=TF_OK)
    {
        std::cout << "ERROR: Unable to import graph " << TF_Message(status) << std::endl;
    }
    num_bytes_in = noCellsPatches*no_input_sizes*sizeof(float);
    num_bytes_out = noCellsPatches*no_output_sizes*sizeof(float);
    in_dims = dimIn;
    out_dims = dimOut;
    in_name = strdup(iname);
    out_name = strdup(oname);
    TF_DeleteImportGraphDefOptions(graph_opts);
    TF_DeleteBuffer(graph_def);
}
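The read_file helper used in the constructor is not shown here; a minimal sketch of such a helper, assuming it simply loads the whole .pb file into a TF_Buffer (the actual implementation may differ):

#include <cstdio>
#include <cstdlib>
#include <tensorflow/c/c_api.h>

// Sketch of a read_file helper: loads an entire frozen-graph file into a
// TF_Buffer. TF_NewBufferFromString copies the bytes, so the temporary
// buffer can be freed immediately afterwards.
TF_Buffer* read_file(const char* path)
{
    FILE* f = std::fopen(path, "rb");
    if (f == nullptr) return nullptr;
    std::fseek(f, 0, SEEK_END);
    long size = std::ftell(f);
    std::fseek(f, 0, SEEK_SET);
    void* data = std::malloc(size);
    if (std::fread(data, 1, size, f) != static_cast<size_t>(size))
    {
        std::fclose(f);
        std::free(data);
        return nullptr;
    }
    std::fclose(f);
    TF_Buffer* buf = TF_NewBufferFromString(data, static_cast<size_t>(size));
    std::free(data);
    return buf;
}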
The doInference method looks like this:
std::vector<float> Inference::doInference(std::vector<float> inVals)
{
    assert((inVals.size()%no_input_sizes)==0);
    std::cout << "EFFECTIVE BATCH SIZE: " << inVals.size() << std::endl;
    float** normalizedInputs = new float* [noCellsPatches]; // allocate pointers
    normalizedInputs[0] = new float [noCellsPatches*no_input_sizes]; // allocate data
    // set pointers
    for (int i = 1; i < noCellsPatches; ++i) {
        normalizedInputs[i] = &normalizedInputs[i-1][no_input_sizes];
    }
    for(int i=0;i<noCellsPatches;i++)
    {
        for(int j=0;j<no_input_sizes;j++)
        {
            normalizedInputs[i][j]=inVals.at(no_input_sizes*i+j);
        }
    }
    const char* iname = in_name;
    TF_Operation* input_op = TF_GraphOperationByName(graph,iname); // make sure the string value is correct by viewing the frozen graph in TensorBoard
    TF_Output input = {input_op,0};
    inputs = &input;
    assert(inputs!=0);
    const char* oname = out_name;
    TF_Operation* output_op = TF_GraphOperationByName(graph,oname); // make sure the string value is correct by viewing the frozen graph in TensorBoard
    TF_Output output = {output_op,0};
    outputs = &output;
    assert(outputs!=0);
    int64_t in_dims_arr[] = {noCellsPatches,no_input_sizes};
    TF_Tensor* input_value = TF_NewTensor(TF_FLOAT,in_dims_arr,2,&normalizedInputs[0][0],num_bytes_in,&Deallocator,0); // normalizedInputs at Arg 4 before
    TF_Tensor* const input_value_const = input_value; // const pointer to TF_Tensor
    TF_Tensor* const* input_values = &input_value_const; // pointer to const pointer to TF_Tensor
    assert(input_values!=0);
    int64_t out_dims_arr[] = {noCellsPatches,no_output_sizes};
    TF_Tensor* output_value = TF_AllocateTensor(TF_FLOAT, out_dims_arr, 2, num_bytes_out); // pointer to TF_Tensor // Arg2!
    TF_Tensor** output_values = &output_value; // pointer to pointer to TF_Tensor
    assert(output_values!=0);
    std::cout << "Running session..." << std::endl;
    TF_SessionOptions* sess_opts = TF_NewSessionOptions();
    int limitCPUThreads = 1; // if you want to limit the inference to a number of CPU threads, you can do that here
    int limitNumberOfCPUs = 0;
    if((limitCPUThreads!=0)&&(limitNumberOfCPUs!=0))
    {
        std::cout << "ERROR! You cannot limit both the number of CPUs and the number of threads!" << std::endl;
    }
    if((limitCPUThreads!=0)&&(limitNumberOfCPUs==0))
    {
        std::cout << "WARNING! You are limiting CPU inference to " << limitCPUThreads << " CPU thread(s) / core(s)!" << std::endl;
        uint8_t intra_op_parallelism_threads = limitCPUThreads; // for operations that can be parallelized internally, such as matrix multiplication
        uint8_t inter_op_parallelism_threads = limitCPUThreads; // for operations that are independent in your TensorFlow graph because there is no directed path between them in the dataflow graph
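        // serialized ConfigProto bytes: tag 0x10 = field 2 (intra_op_parallelism_threads), tag 0x28 = field 5 (inter_op_parallelism_threads), both varint-encoded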
        uint8_t config[] = {0x10,intra_op_parallelism_threads,0x28,inter_op_parallelism_threads};
        TF_SetConfig(sess_opts,config,sizeof(config),status);
        if (TF_GetCode(status) != TF_OK)
        {
            printf("ERROR: %s\n", TF_Message(status));
        }
    }
    if((limitCPUThreads==0)&&(limitNumberOfCPUs!=0)) // SOMETHING HERE STILL SEEMS TO BE WRONG!
    {
        std::cout << "WARNING! You are limiting CPU inference to " << limitNumberOfCPUs << " CPU(s)!" << std::endl;
        uint8_t numberOfCPUs = limitNumberOfCPUs;
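        // serialized ConfigProto bytes: device_count map entry {"CPU": 1}; note that the final byte 0x01 is hard-coded, so numberOfCPUs is never actually encoded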
        uint8_t config[] = {0xa, 0x7, 0xa, 0x3, 0x43, 0x50, 0x55, 0x10, 0x01};
        std::cout << config << std::endl;
        TF_SetConfig(sess_opts,config,sizeof(config),status);
        if (TF_GetCode(status) != TF_OK)
        {
            printf("ERROR: %s\n", TF_Message(status));
        }
    }
    TF_Session* session = TF_NewSession(graph, sess_opts, status);
    assert(TF_GetCode(status)==TF_OK);
    auto t_start = std::chrono::high_resolution_clock::now();
    TF_SessionRun(session,nullptr,inputs,input_values,1,outputs,output_values,1,nullptr,0,nullptr,status);
    auto t_end = std::chrono::high_resolution_clock::now();
    auto total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
    std::cout << "time required for inference: " << total << std::endl;
    float* out_vals = static_cast<float*>(TF_TensorData(*output_values));
    std::vector<float> results(no_output_sizes*noCellsPatches,0);
    for(int i=0;i<noCellsPatches;i++)
    {
        for(int j=0;j<no_output_sizes;j++)
        {
            results.at(i*no_output_sizes+j) = *out_vals;
            out_vals++;
        }
    }
    std::cout << "Successfully ran session!" << std::endl;
    TF_CloseSession(session,status);
    TF_DeleteSession(session,status);
    TF_DeleteSessionOptions(sess_opts);
    delete [] normalizedInputs[0];
    delete [] normalizedInputs;
    return results;
}
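The Deallocator callback passed to TF_NewTensor is not shown above; a minimal sketch of a matching no-op callback, assuming the input buffer is owned and delete[]'d by doInference itself (as the code above does):

// Assumed no-op deallocator matching the signature TF_NewTensor expects:
// the input buffer belongs to doInference, which delete[]s it itself,
// so nothing is freed here.
void Deallocator(void* data, size_t length, void* arg)
{
    // intentionally empty
}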
Is there some kind of memory leak that I am not recognizing? Or what else could be the reason that it runs for hundreds of time steps and then crashes?
Thanks!