Tensorflow 1.14.0 C API TF_SessionRun内存泄漏

时间:2019-07-29 14:11:43

标签: python c tensorflow

我正在使用Windows上official site的Tensorflow 1.14.0 C API预编译二进制文件进行语义分割。我已经使用this repo在python中训练了模型,并将其转换为.pb,加载了模型并使用C API进行了预测,除了我在运行代码时发现的内存泄漏外,其他一切都正常。我试图找出内存泄漏的源头,并在Visual Studio 2017中使用了堆分析工具。两次预测的快照差异的类型视图没有显示新的分配,但是在堆栈视图中,我发现了这一点:

Identifier,     Count Diff,  Size Diff, Count,       Size,               Module
ModelPredict            +3  +1 048 735      7   2 097 870   TensorflowModel.dll
     [Unknown Frame]    +3  +1 048 735      7   2 097 870   tensorflow.dll

我在第二次预测的快照中发现了这个(这就是为什么大小加倍的原因),其中TensorflowModel.dll是我用以下预测代码编写的dll:

typedef struct model_t {
    TF_Graph* graph;
    TF_Session* session;
    TF_Status* status;

    TF_Output input, target, output;

    TF_Operation *init_op, *train_op, *save_op, *restore_op;
    TF_Output checkpoint_file;
} model_t;

typedef struct NetProperties {
    int width;
    int height;
    int border;
    int classes;
    int inputSize;
} NetProperties;

static void Deallocator(void* data, size_t length, void* arg) {
    free(data);
}

static model_t * model;
static NetProperties * properties;

extern "C" EXPORT int ModelCreate(const char* nnFilename, const char* inputName, const char* outputName, int pictureWidth, int pictureHeight, int border, int classes) {
    ModelDestroy();
    model = (model_t*)malloc(sizeof(model_t));;
    model->status = TF_NewStatus();
    model->graph = TF_NewGraph();
    properties = (NetProperties*)malloc(sizeof(NetProperties));
    properties->width = pictureWidth;
    properties->height = pictureHeight;
    properties->border = border;
    properties->classes = classes;
    properties->inputSize = (pictureWidth + border * 2) * (pictureHeight + border * 2) * 3;
    {
        // Create the session.
        TF_SessionOptions* opts = TF_NewSessionOptions();
        model->session = TF_NewSession(model->graph, opts, model->status);
        TF_DeleteSessionOptions(opts);
        if (!Okay(model->status)) return 0;
    }

    TF_Graph* g = model->graph;

    {
        // Import the graph.
        TF_Buffer* graph_def = read_file(nnFilename);
        if (graph_def == NULL) return 0;
        printf("Read GraphDef of %zu bytes\n", graph_def->length);
        TF_ImportGraphDefOptions* opts = TF_NewImportGraphDefOptions();
        TF_GraphImportGraphDef(g, graph_def, opts, model->status);
        TF_DeleteImportGraphDefOptions(opts);
        TF_DeleteBuffer(graph_def);
        if (!Okay(model->status)) return 0;
    }

    // Handles to the interesting operations in the graph.
    model->input.oper = TF_GraphOperationByName(g, inputName);
    model->input.index = 0;
    model->target.oper = TF_GraphOperationByName(g, "target");
    model->target.index = 0;
    model->output.oper = TF_GraphOperationByName(g, outputName);
    model->output.index = 0;

    model->init_op = TF_GraphOperationByName(g, "init");
    model->train_op = TF_GraphOperationByName(g, "train");
    model->save_op = TF_GraphOperationByName(g, "save/control_dependency");
    model->restore_op = TF_GraphOperationByName(g, "save/restore_all");

    model->checkpoint_file.oper = TF_GraphOperationByName(g, "save/Const");
    model->checkpoint_file.index = 0;

    unsigned char * randomData = (unsigned char*)malloc(properties->inputSize * sizeof(unsigned char));
    for (int i = 0; i < properties->inputSize; i++) {
        randomData[i] = (unsigned char)100;
    }
    ModelPredict(randomData, false, false);
    free(randomData);
    return 1;
}

extern "C" EXPORT void ModelDestroy() {
    if (model == nullptr) return;
    TF_DeleteSession(model->session, model->status);
    Okay(model->status);
    TF_DeleteGraph(model->graph);
    TF_DeleteStatus(model->status);
    free(model);
    free(properties);
}

extern "C" EXPORT unsigned char* ModelPredict(unsigned char * batch, bool needRevertRgb, bool needRevertUpdsideDown) {
    if (model == NULL) return NULL;
    const int64_t dims[4] = { 1, properties->height, properties->width, 3 };
    size_t nbytes = properties->inputSize;

    // can be faster
    float * arrayOfFloats = (float*)malloc(nbytes * sizeof(float));
    float multiplier = 1.0;
    if (needRevertUpdsideDown) {
        for (int i = properties->height + properties->border * 2 - 1; i >= 0; i--) {
            int width = (properties->width + properties->border * 2) * 3;
            for (int j = 0; j < properties->width + properties->border * 2; j++) {
                if (needRevertRgb) {
                    arrayOfFloats[i * width + j * 3 + 0] = batch[i * width + j * 3 + 2] * multiplier;
                    arrayOfFloats[i * width + j * 3 + 1] = batch[i * width + j * 3 + 1] * multiplier;
                    arrayOfFloats[i * width + j * 3 + 2] = batch[i * width + j * 3 + 0] * multiplier;
                }
                else {
                    arrayOfFloats[i * width + j * 3 + 0] = batch[i * width + j * 3 + 0] * multiplier;
                    arrayOfFloats[i * width + j * 3 + 1] = batch[i * width + j * 3 + 1] * multiplier;
                    arrayOfFloats[i * width + j * 3 + 2] = batch[i * width + j * 3 + 2] * multiplier;
                }
            }
        }
    }
    else {
        for (int i = 0; i < properties->height + properties->border * 2; i++) {
            int width = (properties->width + properties->border * 2) * 3;
            for (int j = 0; j < properties->width + properties->border * 2; j++) {
                if (needRevertRgb) {
                    arrayOfFloats[i * width + j * 3 + 0] = batch[i * width + j * 3 + 2] * multiplier;
                    arrayOfFloats[i * width + j * 3 + 1] = batch[i * width + j * 3 + 1] * multiplier;
                    arrayOfFloats[i * width + j * 3 + 2] = batch[i * width + j * 3 + 0] * multiplier;
                }
                else {
                    arrayOfFloats[i * width + j * 3 + 0] = batch[i * width + j * 3 + 0] * multiplier;
                    arrayOfFloats[i * width + j * 3 + 1] = batch[i * width + j * 3 + 1] * multiplier;
                    arrayOfFloats[i * width + j * 3 + 2] = batch[i * width + j * 3 + 2] * multiplier;
                }
            }
        }
    }


    // Optionally, you can check that your input_op and input tensors are correct
    //// by using some of the functions provided by the C API.
    //std::cout << "Input op info: " << TF_OperationNumOutputs(input_op) << "\n";
    //std::cout << "Input data info: " << TF_Dim(input, 0) << "\n";

    std::vector<TF_Output> inputs;
    std::vector<TF_Tensor*> input_values;
    TF_Operation* input_op = model->input.oper;
    TF_Output input_opout = { input_op, 0 };
    inputs.push_back(input_opout);

    TF_Tensor* input = TF_NewTensor(TF_FLOAT, dims, 4, (void*)arrayOfFloats, nbytes * sizeof(float), &Deallocator, NULL);
    input_values.push_back(input);

    int outputSize = properties->width * properties->height * properties->classes;

    int64_t out_dims[] = { 1, properties->height, properties->width, properties->classes };

    // Create vector to store graph output operations
    std::vector<TF_Output> outputs;
    TF_Operation* output_op = model->output.oper;
    TF_Output output_opout = { output_op, 0 };
    outputs.push_back(output_opout);

    // Create TF_Tensor* vector
    //std::vector<TF_Tensor*> output_values(outputs.size(), nullptr);

    // Similar to creating the input tensor, however here we don't yet have the
    // output values, so we use TF_AllocateTensor()
    TF_Tensor* output_value = TF_AllocateTensor(TF_FLOAT, out_dims, 4, outputSize * sizeof(float));
    //output_values.push_back(output_value);

    //// As with inputs, check the values for the output operation and output tensor
    //std::cout << "Output: " << TF_OperationName(output_op) << "\n";
    //std::cout << "Output info: " << TF_Dim(output_value, 0) << "\n";

    TF_SessionRun(model->session, NULL,
        &inputs[0], &input_values[0], inputs.size(),
        &outputs[0], &output_value, outputs.size(),
        /* No target operations to run */
        NULL, 0, NULL, model->status);
    if (!Okay(model->status)) return NULL;
    TF_DeleteTensor(input_values[0]);

    // memory allocations take place here
    //float* prediction = (float*)TF_TensorData(output_value);
    float* prediction = (float*)malloc(sizeof(float) * outputSize);
    memcpy(prediction, TF_TensorData(output_value), sizeof(float) * outputSize);
    unsigned char * charPrediction = new unsigned char[outputSize * sizeof(unsigned char)];
    for (int i = 0; i < outputSize; i++) {
        charPrediction[i] = (unsigned char)((prediction[i] * 255));
    }
    free(prediction);
    TF_DeleteTensor(output_value);
    return charPrediction;
}

在代码运行TF_SessionRun时分配并没有释放这1048735字节。 现在我不明白这是我的错误,还是应该向github上的tensorflow回购提交问题。

更新1:

我发现泄漏内存的大小取决于型号。我加载了自写的简单U-Net,泄漏的大小减少到153823字节,但在每次TF_SessionRun之后仍然存在。此模型的Python代码:

def test_unet(pretrained_weights = None,input_size = 284, border=88):
    inputs = Input(((input_size + border * 2) * (input_size + border * 2) * 3,))

    reshape1 = Reshape((input_size + border * 2 , input_size + border * 2, 3))(inputs)
    # 284
    conv0 = Conv2D(64, 3 , activation = 'relu',  kernel_initializer = 'he_normal')(reshape1)
    # 282
    conv0 = Conv2D(64, 3, activation = 'relu',  kernel_initializer = 'he_normal')(conv0)
    # 280
    conv0 = BatchNormalization()(conv0)
    pool0 = MaxPooling2D(pool_size=(2, 2))(conv0)

    # 140
    conv1 = Conv2D(128, 3 , activation = 'relu',  kernel_initializer = 'he_normal')(pool0)
    # 138
    conv1 = Conv2D(128, 3, activation = 'relu',  kernel_initializer = 'he_normal')(conv1)
    # 136
    conv1 = BatchNormalization()(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    # 68
    conv2 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal')(pool1)
    # 66
    conv2 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal')(conv2)
    # 64
    conv2 = BatchNormalization()(conv2)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

    # 32
    conv3 = Conv2D(512, 3, activation = 'relu', kernel_initializer = 'he_normal')(pool2)
    # 30
    conv3 = Conv2D(512, 3, activation = 'relu', kernel_initializer = 'he_normal')(conv3)
    # 28
    conv3 = BatchNormalization()(conv3)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)

    # added
    conv3_1 = Conv2D(1024, 3, activation = 'relu', kernel_initializer = 'he_normal', padding='same')(pool3)
    conv3_1 = Conv2D(1024, 3, activation = 'relu', kernel_initializer = 'he_normal', padding='same')(conv3_1)
    conv3_1 = BatchNormalization()(conv3_1)
    drop1 = Dropout(0.5)(conv3_1)
    up0 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal', padding='same')(UpSampling2D(size = (2,2))(drop1))
    up0 = BatchNormalization()(up0)

    # 56
    up1 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(up0))
    up1 = BatchNormalization()(up1)
    # 54
    cropp1 = Cropping2D((5,5))(conv2)
    merge1 = concatenate([cropp1,up1], axis = 3)
    conv4 = Conv2D(256, 3, activation = 'relu', kernel_initializer = 'he_normal')(merge1)
    # 52
    #conv4 = BatchNormalization()(conv4)

    # 104
    up2 = Conv2D(128, 2, activation = 'relu', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv4))
    up2 = BatchNormalization()(up2)
    # 103? wtf

    up2 = Cropping2D(((1,0),(1,0)))(up2)
    cropp2 = Cropping2D((17,17))(conv1)
    merge2 = concatenate([cropp2,up2], axis = 3)
    conv5 = Conv2D(128, 3, activation = 'relu', kernel_initializer = 'he_normal')(merge2)
    # 100
    conv5 = BatchNormalization()(conv5)

    # 200
    up3 = Conv2D(64, 2, activation = 'relu', kernel_initializer = 'he_normal')(UpSampling2D(size = (2,2))(conv5))
    up3 = BatchNormalization()(up3)
    # 199
    up3 = Cropping2D(((1,0),(1,0)))(up3)
    cropp3 = Cropping2D((41,41))(conv0)
    merge3 = concatenate([cropp3,up3], axis = 3)
    conv6 = Conv2D(64, 3, activation = 'relu', kernel_initializer = 'he_normal')(merge3)
    # 196
    conv7 = Conv2D(1, 1, activation = 'sigmoid')(conv6)
    reshape2 = Reshape((input_size * input_size,))(conv7)
    model = Model(inputs=inputs, outputs=reshape2)

    model.compile(optimizer=RMSprop(lr=0.0001), loss=bce_dice_loss, metrics=[dice_coeff])

    model.summary()

    if(pretrained_weights):
        model.load_weights(pretrained_weights)

    return model

UDPATE 2:

我找出什么是内存泄漏。泄漏的大小= input_size + output_size +159。我更改了图像的大小,并注意到泄漏的大小取决于输入图像的大小。因此,最初的泄漏是图像512 * 512 * 3(对于rgb)+输出512 * 512 * 1(一类)+神秘的159字节= 1048735的泄漏。现在看来,我错过了一些会话来免费传递命令张量,TF_DeleteTensor();释放了内存,但仍然分配了一些东西。

0 个答案:

没有答案