Question

我正在编写一种算法，该算法需要能够快速相乘矩阵。我尝试使用线程，然后考虑使用GPU。

我最初想使用CUDA，但无法使其正常工作，所以我使用了OpenCL。

我使用了在Internet上找到的代码，并对其进行了更改以使其可以与我的程序配合使用。

但是，GPU内存会不断增加，直到没有剩余的内存为止。该代码似乎可以正确释放内存。

你知道怎么了吗？

这是我用来加载OpenCL并乘以矩阵的代码：

/***************
  Copyright (c) 2015, MedicineYeh
  All rights reserved.

  Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************/

#include <stdio.h>
#include <stdlib.h>    
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <CL/cl.h>
#include "matrix.h"

#define checkErr(fun, statement) err = fun;\
                                 if (err != CL_SUCCESS) {statement}
#define checkExit(value, message) if (value == 0) {printf(message); goto release;}

//define function 

double get_event_exec_time(cl_event event)
{
    cl_ulong start_time, end_time;
    /*Get start device counter for the event*/
    clGetEventProfilingInfo (event,
            CL_PROFILING_COMMAND_START,
            sizeof(cl_ulong),
            &start_time,
            NULL);
    /*Get end device counter for the event*/
    clGetEventProfilingInfo (event,
            CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong),
            &end_time,
            NULL);
    /*Convert the counter values to milli seconds*/
    double total_time = (end_time - start_time) * 1e-6;
    return total_time;
}

cl_program load_program(cl_context context, cl_device_id device, const char* filename)
{
    FILE *fp = fopen(filename, "rt");
    size_t length;
    char *data;
    char *build_log;
    size_t ret_val_size;
    cl_program program = 0;
    cl_int status = 0;
    if(!fp) return 0;

    // get file length
    fseek(fp, 0, SEEK_END);
    length = ftell(fp);
    fseek(fp, 0, SEEK_SET);

    // read program source
    data = (char *)malloc(length + 1);
    fread(data, sizeof(char), length, fp);
    data[length] = '\0';

    // create and build program 
    program = clCreateProgramWithSource(context, 1, (const char **)&data, 0, 0);
    if (program == 0) return 0;

    status = clBuildProgram(program, 0, 0, 0, 0, 0);
    if (status != CL_SUCCESS) {
        printf("Error:  Building Program from file %s\n", filename);
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
        build_log = (char *)malloc(ret_val_size + 1);
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
        build_log[ret_val_size] = '\0';
        printf("Building Log:\n%s", build_log);
        return 0;
    }

    return program;
}

void gpu_mul(matrix m1, matrix m2, matrix r)
{
    cl_int err = 0;
    cl_uint num = 0;
    cl_platform_id *platforms = NULL;
    cl_context_properties prop[3] = {0};
    cl_context context = 0;
    cl_device_id *devices = NULL;
    cl_command_queue queue = 0;
    cl_program program = 0;
    cl_mem cl_a = 0, cl_b = 0, cl_res = 0;
    cl_kernel adder = 0;
    cl_event event;
    unsigned int num_total_devices = 0;
    char devname[16][256] = ;
    size_t cb, work_size;
    unsigned int i;

    int m = m1.rows;
    int n = m1.columns;
    int p = m2.columns;
    double *a = m1.value;
    double *b = m2.value;
    double *res = r.value;

    checkErr(clGetPlatformIDs(0, 0, &num), 
             printf("Unable to get platforms\n");
             return;
             );

    platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num);
    checkErr(clGetPlatformIDs(num, platforms, NULL), 
             printf("Unable to get platform ID\n");
             return;
             );

    checkErr(clGetPlatformIDs(0, 0, &num), 
             printf("Unable to get platforms\n");
             return;
             );

    //printf("Found %d platforms:\n", num);
    for (i = 0; i < num; i++) {
        char str[1024];
        clGetPlatformInfo (platforms[i], CL_PLATFORM_NAME, 1024, str, NULL);
        //printf("\t%d: %s\n", i, str);
    }

    prop[0] = CL_CONTEXT_PLATFORM;
    prop[1] = (cl_context_properties)platforms[0];
    prop[2] = 0;
    context = clCreateContextFromType(prop, CL_DEVICE_TYPE_ALL, NULL, NULL, NULL);
    checkExit(context, "Can't create OpenCL context\n");

    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
    devices = (cl_device_id *)malloc(cb);
    clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, 0);
    checkExit(cb, "Can't get devices\n");
    num_total_devices = cb / sizeof(cl_device_id);

    //printf("Found %d devices:\n", num_total_devices);
    for (i = 0; i < num_total_devices; i++) {
        clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 256, devname[i], 0);
        //printf("\t%d: %s", i, devname[i]);
        clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &cb, 0);
        //printf("  - %d\n", (int)cb);
    }

    //Specify the queue to be profile-able
    queue = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, 0);
    checkExit(queue, "Can't create command queue\n");

    program = load_program(context, devices[0], "matrixmul_kernel.cl");
    checkExit(program, "Fail to build program\n");

    cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * m * n, a, NULL);
    cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * n * p, b, NULL);
    cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double) * m * p , res, NULL);
    if (cl_a == 0 || 
        cl_b == 0 || 
        cl_res == 0) {
            if(cl_a==0)
                printf("Can't create OpenCL buffer (cl_a) \n");
            if(cl_b==0)
                printf("Can't create OpenCL buffer (cl_b) \n");
            if(cl_res==0)
                printf("Can't create OpenCL buffer (cl_res) \n");
        goto release;
    }
//    adder = clCreateKernel(program, "adder", &err);
    adder = clCreateKernel(program, "test", &err);
    if (err == CL_INVALID_KERNEL_NAME) printf("CL_INVALID_KERNEL_NAME\n");
    checkExit(adder, "Can't load kernel\n");

    clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
    clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b);
    clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res);
    clSetKernelArg(adder, 3, sizeof(cl_int), &m);
    clSetKernelArg(adder, 4, sizeof(cl_int), &n);
    clSetKernelArg(adder, 5, sizeof(cl_int), &p);
    work_size = m * p;

    checkErr(clEnqueueNDRangeKernel(queue, adder, 1, 0, &work_size, 0, 0, 0, &event),
             printf("Can't enqueue kernel\n");
            );
    checkErr(clEnqueueReadBuffer(queue, cl_res, CL_TRUE, 0, sizeof(double) * work_size, res, 0, 0, 0),
             printf("Can't enqueue read buffer\n");
            );
    clWaitForEvents(1, &event);
    //printf("Execution Time: %.04lf ms\n\n", get_event_exec_time(event));

    //Make sure everything is done before we do anything
    clFinish(queue);

release:
    clReleaseKernel(adder);
    clReleaseProgram(program);
    clReleaseMemObject(cl_a);
    clReleaseMemObject(cl_b);
    clReleaseMemObject(cl_res);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
}

Answer 1

您每次迭代泄漏多少个字节？这将使您对正在泄漏的内容有所了解。如果很小，我注意到您在clEnqueueNDRangeKernel中使用了cl_event对象（您使用clWaitForEvents调用了该对象），但是之后再也没有使用clReleaseEvent释放它。对于其他项目，您可以检查参考计数，以查看是否有您不知道的参考。

C语言中的OpenCL：无法释放GPU内存

1 个答案: