我正在使用Windows上official site的Tensorflow 1.14.0 C API预编译二进制文件进行语义分割。我已经使用this repo在python中训练了模型,并将其转换为.pb,加载了模型并使用C API进行了预测,除了我在运行代码时发现的内存泄漏外,其他一切都正常。我试图找出内存泄漏的源头,并在Visual Studio 2017中使用了堆分析工具。两次预测的快照差异的类型视图没有显示新的分配,但是在堆栈视图中,我发现了这一点:
Identifier, Count Diff, Size Diff, Count, Size, Module
ModelPredict +3 +1 048 735 7 2 097 870 TensorflowModel.dll
[Unknown Frame] +3 +1 048 735 7 2 097 870 tensorflow.dll
我在第二次预测的快照中发现了这个(这就是为什么大小加倍的原因),其中TensorflowModel.dll是我用以下预测代码编写的dll:
typedef struct model_t {
TF_Graph* graph;
TF_Session* session;
TF_Status* status;
TF_Output input, target, output;
TF_Operation *init_op, *train_op, *save_op, *restore_op;
TF_Output checkpoint_file;
} model_t;
typedef struct NetProperties {
int width;
int height;
int border;
int classes;
int inputSize;
} NetProperties;
static void Deallocator(void* data, size_t length, void* arg) {
free(data);
}
static model_t * model;
static NetProperties * properties;
extern "C" EXPORT int ModelCreate(const char* nnFilename, const char* inputName, const char* outputName, int pictureWidth, int pictureHeight, int border, int classes) {
ModelDestroy();
model = (model_t*)malloc(sizeof(model_t));;
model->status = TF_NewStatus();
model->graph = TF_NewGraph();
properties = (NetProperties*)malloc(sizeof(NetProperties));
properties->width = pictureWidth;
properties->height = pictureHeight;
properties->border = border;
properties->classes = classes;
properties->inputSize = (pictureWidth + border * 2) * (pictureHeight + border * 2) * 3;
{
// Create the session.
TF_SessionOptions* opts = TF_NewSessionOptions();
model->session = TF_NewSession(model->graph, opts, model->status);
TF_DeleteSessionOptions(opts);
if (!Okay(model->status)) return 0;
}
TF_Graph* g = model->graph;
{
// Import the graph.
TF_Buffer* graph_def = read_file(nnFilename);
if (graph_def == NULL) return 0;
printf("Read GraphDef of %zu bytes\n", graph_def->length);
TF_ImportGraphDefOptions* opts = TF_NewImportGraphDefOptions();
TF_GraphImportGraphDef(g, graph_def, opts, model->status);
TF_DeleteImportGraphDefOptions(opts);
TF_DeleteBuffer(graph_def);
if (!Okay(model->status)) return 0;
}
// Handles to the interesting operations in the graph.
model->input.oper = TF_GraphOperationByName(g, inputName);
model->input.index = 0;
model->target.oper = TF_GraphOperationByName(g, "target");
model->target.index = 0;
model->output.oper = TF_GraphOperationByName(g, outputName);
model->output.index = 0;
model->init_op = TF_GraphOperationByName(g, "init");
model->train_op = TF_GraphOperationByName(g, "train");
model->save_op = TF_GraphOperationByName(g, "save/control_dependency");
model->restore_op = TF_GraphOperationByName(g, "save/restore_all");
model->checkpoint_file.oper = TF_GraphOperationByName(g, "save/Const");
model->checkpoint_file.index = 0;
unsigned char * randomData = (unsigned char*)malloc(properties->inputSize * sizeof(unsigned char));
for (int i = 0; i < properties->inputSize; i++) {
randomData[i] = (unsigned char)100;
}
ModelPredict(randomData, false, false);
free(randomData);
return 1;
}
extern "C" EXPORT void ModelDestroy() {
if (model == nullptr) return;
TF_DeleteSession(model->session, model->status);
Okay(model->status);
TF_DeleteGraph(model->graph);
TF_DeleteStatus(model->status);
free(model);
free(properties);
}
extern "C" EXPORT unsigned char* ModelPredict(unsigned char * batch, bool needRevertRgb, bool needRevertUpdsideDown) {
if (model == NULL) return NULL;
const int64_t dims[4] = { 1, properties->height, properties->width, 3 };
size_t nbytes = properties->inputSize;
// can be faster
float * arrayOfFloats = (float*)malloc(nbytes * sizeof(float));
float multiplier = 1.0;
if (needRevertUpdsideDown) {
for (int i = properties->height + properties->border * 2 - 1; i >= 0; i--) {
int width = (properties->width + properties->border * 2) * 3;
for (int j = 0; j < properties->width + properties->border * 2; j++) {
if (needRevertRgb) {
arrayOfFloats[i * width + j * 3 + 0] = batch[i * width + j * 3 + 2] * multiplier;
arrayOfFloats[i * width + j * 3 + 1] = batch[i * width + j * 3 + 1] * multiplier;
arrayOfFloats[i * width + j * 3 + 2] = batch[i * width + j * 3 + 0] * multiplier;
}
else {
arrayOfFloats[i * width + j * 3 + 0] = batch[i * width + j * 3 + 0] * multiplier;
arrayOfFloats[i * width + j * 3 + 1] = batch[i * width + j * 3 + 1] * multiplier;
arrayOfFloats[i * width + j * 3 + 2] = batch[i * width + j * 3 + 2] * multiplier;
}
}
}
}
else {
for (int i = 0; i < properties->height + properties->border * 2; i++) {
int width = (properties->width + properties->border * 2) * 3;
for (int j = 0; j < properties->width + properties->border * 2; j++) {
if (needRevertRgb) {
arrayOfFloats[i * width + j * 3 + 0] = batch[i * width + j * 3 + 2] * multiplier;
arrayOfFloats[i * width + j * 3 + 1] = batch[i * width + j * 3 + 1] * multiplier;
arrayOfFloats[i * width + j * 3 + 2] = batch[i * width + j * 3 + 0] * multiplier;
}
else {
arrayOfFloats[i * width + j * 3 + 0] = batch[i * width + j * 3 + 0] * multiplier;
arrayOfFloats[i * width + j * 3 + 1] = batch[i * width + j * 3 + 1] * multiplier;
arrayOfFloats[i * width + j * 3 + 2] = batch[i * width + j * 3 + 2] * multiplier;
}
}
}
}
// Optionally, you can check that your input_op and input tensors are correct
//// by using some of the functions provided by the C API.
//std::cout << "Input op info: " << TF_OperationNumOutputs(input_op) << "\n";
//std::cout << "Input data info: " << TF_Dim(input, 0) << "\n";
std::vector<TF_Output> inputs;
std::vector<TF_Tensor*> input_values;
TF_Operation* input_op = model->input.oper;
TF_Output input_opout = { input_op, 0 };
inputs.push_back(input_opout);
TF_Tensor* input = TF_NewTensor(TF_FLOAT, dims, 4, (void*)arrayOfFloats, nbytes * sizeof(float), &Deallocator, NULL);
input_values.push_back(input);
int outputSize = properties->width * properties->height * properties->classes;
int64_t out_dims[] = { 1, properties->height, properties->width, properties->classes };
// Create vector to store graph output operations
std::vector<TF_Output> outputs;
TF_Operation* output_op = model->output.oper;
TF_Output output_opout = { output_op, 0 };
outputs.push_back(output_opout);
// Create TF_Tensor* vector
//std::vector<TF_Tensor*> output_values(outputs.size(), nullptr);
// Similar to creating the input tensor, however here we don't yet have the
// output values, so we use TF_AllocateTensor()
TF_Tensor* output_value = TF_AllocateTensor(TF_FLOAT, out_dims, 4, outputSize * sizeof(float));
//output_values.push_back(output_value);
//// As with inputs, check the values for the output operation and output tensor
//std::cout << "Output: " << TF_OperationName(output_op) << "\n";
//std::cout << "Output info: " << TF_Dim(output_value, 0) << "\n";
TF_SessionRun(model->session, NULL,
&inputs[0], &input_values[0], inputs.size(),
&outputs[0], &output_value, outputs.size(),
/* No target operations to run */
NULL, 0, NULL, model->status);
if (!Okay(model->status)) return NULL;
TF_DeleteTensor(input_values[0]);
// memory allocations take place here
//float* prediction = (float*)TF_TensorData(output_value);
float* prediction = (float*)malloc(sizeof(float) * outputSize);
memcpy(prediction, TF_TensorData(output_value), sizeof(float) * outputSize);
unsigned char * charPrediction = new unsigned char[outputSize * sizeof(unsigned char)];
for (int i = 0; i < outputSize; i++) {
charPrediction[i] = (unsigned char)((prediction[i] * 255));
}
free(prediction);
TF_DeleteTensor(output_value);
return charPrediction;
}
在代码运行TF_SessionRun
时分配并没有释放这1048735字节。
现在我不明白这是我的错误,还是应该向github上的tensorflow回购提交问题。
更新1:
我发现泄漏内存的大小取决于型号。我加载了自写的简单U-Net,泄漏的大小减少到153823字节,但在每次TF_SessionRun
之后仍然存在。此模型的Python代码:
def test_unet(pretrained_weights = None,input_size = 284, border=88):
inputs = Input(((input_size + border * 2) * (input_size + border * 2) * 3,))
reshape1 = Reshape((input_size + border * 2 , input_size + border * 2, 3))(inputs)
# 284
conv0 = Conv2D(64, 3 , activation = 'relu', kernel_initializer = 'he_normal')(reshape1)
# 282
conv0 = Conv2D(64, 3, activation = 'relu', kernel_initializer = 'he_normal')(conv0)
# 280
conv0 = BatchNormalization()(conv0)
pool0 = MaxPooling2D(pool_size=(2, 2))(conv0)
# 140
conv1 = Conv2D(128, 3 , activation = 'relu', kernel_initializer = 'he_normal')(pool0)
# 138
conv1 = Conv2D(128, 3, activation = 'relu', kernel_initializer = 'he_normal')(conv1)
# 136
conv1 = BatchNormalization()(conv1)
pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
# 68
conv2 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal')(pool1)
# 66
conv2 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal')(conv2)
# 64
conv2 = BatchNormalization()(conv2)
pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
# 32
conv3 = Conv2D(512, 3, activation = 'relu', kernel_initializer = 'he_normal')(pool2)
# 30
conv3 = Conv2D(512, 3, activation = 'relu', kernel_initializer = 'he_normal')(conv3)
# 28
conv3 = BatchNormalization()(conv3)
pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)
# added
conv3_1 = Conv2D(1024, 3, activation = 'relu', kernel_initializer = 'he_normal', padding='same')(pool3)
conv3_1 = Conv2D(1024, 3, activation = 'relu', kernel_initializer = 'he_normal', padding='same')(conv3_1)
conv3_1 = BatchNormalization()(conv3_1)
drop1 = Dropout(0.5)(conv3_1)
up0 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal', padding='same')(UpSampling2D(size = (2,2))(drop1))
up0 = BatchNormalization()(up0)
# 56
up1 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(up0))
up1 = BatchNormalization()(up1)
# 54
cropp1 = Cropping2D((5,5))(conv2)
merge1 = concatenate([cropp1,up1], axis = 3)
conv4 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal')(merge1)
# 52
#conv4 = BatchNormalization()(conv4)
# 104
up2 = Conv2D(128, 2, activation = 'relu', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv4))
up2 = BatchNormalization()(up2)
# 103? wtf
up2 = Cropping2D(((1,0),(1,0)))(up2)
cropp2 = Cropping2D((17,17))(conv1)
merge2 = concatenate([cropp2,up2], axis = 3)
conv5 = Conv2D(128, 3, activation = 'relu', kernel_initializer = 'he_normal')(merge2)
# 100
conv5 = BatchNormalization()(conv5)
# 200
up3 = Conv2D(64, 2, activation = 'relu', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv5))
up3 = BatchNormalization()(up3)
# 199
up3 = Cropping2D(((1,0),(1,0)))(up3)
cropp3 = Cropping2D((41,41))(conv0)
merge3 = concatenate([cropp3,up3], axis = 3)
conv6 = Conv2D(64, 3, activation = 'relu', kernel_initializer = 'he_normal')(merge3)
# 196
conv7 = Conv2D(1, 1, activation = 'sigmoid')(conv6)
reshape2 = Reshape((input_size * input_size,))(conv7)
model = Model(inputs=inputs, outputs=reshape2)
model.compile(optimizer=RMSprop(lr=0.0001), loss=bce_dice_loss, metrics=[dice_coeff])
model.summary()
if(pretrained_weights):
model.load_weights(pretrained_weights)
return model
UDPATE 2:
我找出什么是内存泄漏。泄漏的大小= input_size + output_size +159。我更改了图像的大小,并注意到泄漏的大小取决于输入图像的大小。因此,最初的泄漏是图像512 * 512 * 3(对于rgb)+输出512 * 512 * 1(一类)+神秘的159字节= 1048735的泄漏。现在看来,我错过了一些会话来免费传递命令张量,TF_DeleteTensor();
释放了内存,但仍然分配了一些东西。