Question

我最近开始使用 OpenCL（英特尔 SDK，i7-7700）。我试图实现一个简单的 k-means 算法，算法本身运行完全正常，但主机程序在释放输出缓冲区（`clReleaseMemObject(output_z_buf);`）时总是崩溃，而释放其他缓冲区则没有问题。我盯着代码看了一整天，仍然找不出哪里做错了。有人知道是怎么回事吗？

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#ifdef __APPLE__
    #include "OpenCL/opencl.h"
#else
    #include <CL\cl.h>
#endif

using namespace std;

// Forward declarations; the definitions are not part of this listing.
// CheckForError is called with the status code of OpenCL calls
// (presumably it reports/aborts on anything other than CL_SUCCESS — confirm against the definition).
static void CheckForError(int success);
// Called once in main() with the selected device, under the comment "Print selected device info".
static void display_device_info(cl_device_id device);

// Path of the OpenCL kernel source file that main() reads and compiles.
const char* programPath = "kmeans.cl";

int main()
{
    unsigned platformIDCount = 0;
    unsigned selectedPlatform = 0;
    unsigned deviceIDCount = 0;
    unsigned selectedDevice = 0;
    size_t stringSize = 0;
    string stringResult = "";
    cl_event kernel_event;
    cl_event finish_event;
    int success = CL_SUCCESS;

    // Get amount of available platforms
    clGetPlatformIDs(0, NULL, &platformIDCount);
    if (platformIDCount == 0) {
        cerr << "No OpenCL platform found" << endl;
        return 1;
    }
    else {
        cout << "Found " << platformIDCount << " platforms:" << endl;
    }

    // Get IDs of available platforms
    vector<cl_platform_id> platformIDs(platformIDCount);
    clGetPlatformIDs(platformIDCount, platformIDs.data(), NULL);

    // Get names of available platforms
    for (unsigned platformID = 0; platformID < platformIDCount; platformID++) {
        clGetPlatformInfo(platformIDs[platformID], CL_PLATFORM_NAME, 0, NULL, &stringSize);
        stringResult.resize(stringSize);
        clGetPlatformInfo(platformIDs[platformID], CL_PLATFORM_NAME, stringSize, const_cast<char*> (stringResult.data()), NULL);

        cout << (platformID) << ": " << stringResult << endl;
    }

    // Select platform
    cout << "Please select a platform: " << endl;
    cin >> selectedPlatform;

    // Get amount of available devices
    deviceIDCount = 0;
    clGetDeviceIDs(platformIDs[selectedPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceIDCount);
    if (deviceIDCount == 0) {
        cerr << "No OpenCL devices found" << endl;
        return 1;
    }
    else {
        cout << "Found " << deviceIDCount << " devices:" << endl;
    }

    // Get IDs of available devices
    vector<cl_device_id> deviceIDs(deviceIDCount);
    clGetDeviceIDs(platformIDs[selectedPlatform], CL_DEVICE_TYPE_ALL, deviceIDCount, deviceIDs.data(), NULL);

    // Get names of available devices
    for (unsigned deviceID = 0; deviceID < deviceIDCount; deviceID++) {
        clGetDeviceInfo(deviceIDs[deviceID], CL_DEVICE_NAME, 0, NULL, &stringSize);
        stringResult.resize(stringSize);
        clGetDeviceInfo(deviceIDs[deviceID], CL_DEVICE_NAME, stringSize, const_cast<char*> (stringResult.data()), NULL);

        cout << (deviceID) << ": " << stringResult << endl;
    }

    // Select device
    cout << "Please select a device: " << endl;
    cin >> selectedDevice;

    // Print selected device info
    display_device_info(deviceIDs[selectedDevice]);

    // Create device context
    const cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties> (platformIDs[selectedPlatform]), 0, 0 };
    cl_context context = clCreateContext(contextProperties, deviceIDCount, deviceIDs.data(), NULL, NULL, &success);
    CheckForError(success);
    cout << "Context created" << endl;

    // Create program
    ifstream in(programPath);
    string programSource((istreambuf_iterator<char>(in)), istreambuf_iterator<char>());
    size_t lengths[1] = { programSource.size() };
    const char* sources[1] = { programSource.data() };
    cl_program program = clCreateProgramWithSource(context, 1, sources, lengths, &success);
    CheckForError(success);
    cout << "Program created" << endl;

    // Build program
    success = clBuildProgram(program, deviceIDCount, deviceIDs.data(), NULL, NULL, NULL);
    if (success == CL_BUILD_PROGRAM_FAILURE) {
        // Print Build Info Log
        size_t log_size;
        clGetProgramBuildInfo(program, deviceIDs[selectedDevice], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char *log = (char *)malloc(log_size);
        clGetProgramBuildInfo(program, deviceIDs[selectedDevice], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        printf("%s\n", log);
    }
    CheckForError(success);
    cout << "Program built" << endl;

    // Create kernel
    cl_kernel kernel = clCreateKernel(program, "kmeans", &success);
    CheckForError(success);
    cout << "Kernel created" << endl;

    // Generate test data
    // A 1024 height * 1024 width area
    // 256 random mean values for 256 k

    static const size_t testDataSize = 1024;
    static const unsigned kCount = 16;

    cl_short3 *input_x = (cl_short3 *)malloc(testDataSize * sizeof(cl_short3));
    cl_short3 *input_c = (cl_short3 *)malloc(kCount * sizeof(cl_short3));
    cl_short3 *output_z = (cl_short3 *)malloc(kCount * sizeof(cl_short3));

    for (int j = 0; j < testDataSize; ++j) {
        input_x[j].s0 = (rand() % (1 << 13)) - (1 << 13) / 2;
        input_x[j].s1 = (rand() % (1 << 13)) - (1 << 13) / 2;
        input_x[j].s2 = (rand() % (1 << 13)) - (1 << 13) / 2;
    }

    for (int j = 0; j < kCount; ++j) {
        int idx = rand() % testDataSize;
        input_c[j] = input_x[idx];
    }
    cout << "Test data generated" << endl;

    cl_mem input_x_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, testDataSize * sizeof(cl_short3), (void *) input_x, NULL);
    cl_mem input_c_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, kCount * sizeof(cl_short3), (void *) input_c, NULL);
    cl_mem output_z_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, kCount * sizeof(cl_short3), (void *) output_z, NULL);
    cout << "Buffers created" << endl;

    // Create command queue
    cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceIDs[selectedDevice], 0, &success);
    CheckForError(success);
    cout << "Command queue created" << endl;

    // Set kernel arguments
    int argc = 0;
    clSetKernelArg(kernel, argc++, sizeof(cl_mem), &input_x_buf);
    clSetKernelArg(kernel, argc++, sizeof(cl_mem), &input_c_buf);
    clSetKernelArg(kernel, argc++, sizeof(cl_mem), &output_z_buf);

    // Set local memory arguments (if Scratchpad is available, otherwise fail)
    clSetKernelArg(kernel, argc++, 2 * sizeof(cl_long), NULL);
    clSetKernelArg(kernel, argc++, kCount * sizeof(cl_short3), NULL);
    clSetKernelArg(kernel, argc++, kCount * sizeof(cl_int), NULL);
    clSetKernelArg(kernel, argc++, kCount * sizeof(cl_long), NULL);
    clSetKernelArg(kernel, argc++, kCount * sizeof(cl_int3), NULL);

    clSetKernelArg(kernel, argc++, sizeof(unsigned), &kCount);
    cout << "Kernel arguments set" << endl;

    // Enqueue command to execute kernel on device
    const size_t globalWorkSize[] = { testDataSize, 0, 0 };
    success = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, globalWorkSize, 0, NULL, &kernel_event);
    //success = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
    CheckForError(success);
    //cout << "Command enqueued" << endl;

    // Read memory buffer for result data
    success = clEnqueueReadBuffer(queue, output_z_buf, CL_TRUE, 0, kCount * sizeof(cl_short3), output_z, 0, NULL, &finish_event);
    CheckForError(success);
    cout << "Memory buffer for result data read" << endl;

    success = clWaitForEvents(1, &finish_event);
    success += clWaitForEvents(1, &kernel_event);
    success += clFinish(queue);
    CheckForError(success);
    cout << "Command queue finished" << endl;

    cout << "New centers:" << endl;
    for (unsigned i = 0; i < kCount; i++) {
        cout << i << ": " << output_z[i].s0 << ", " << output_z[i].s1 << ", " << output_z[i].s2 << endl;
    }

    // Clean resources

    clReleaseKernel(kernel);
    clReleaseProgram(program);

    clReleaseMemObject(input_x_buf);
    clReleaseMemObject(input_c_buf);
    clReleaseMemObject(output_z_buf);

    clReleaseCommandQueue(queue);

    clReleaseContext(context);

    return 0;
}

更新：我没有完全弄清问题的原因，但为了方便今后遇到类似问题的读者，这里记录一下我的解决办法。首先，我根据 parallel highway（用户名）的建议修改了主机代码：

// Read memory buffer for result data.
// The read is blocking (CL_TRUE) and explicitly waits on kernel_event, so
// output_z is complete when the call returns.
success = clEnqueueReadBuffer(queue, output_z_buf, CL_TRUE, 0, kCount * sizeof(cl_short3), output_z, 1, &kernel_event, &finish_event);
CheckForError(success);
cout << "Memory buffer for result data read" << endl;

// Each status is checked separately so a failure is attributed to the right call.
success = clFlush(queue);
CheckForError(success);
success = clWaitForEvents(1, &finish_event);
CheckForError(success);
success = clFinish(queue);
CheckForError(success);
cout << "Command queue finished" << endl;

cout << "New centers:" << endl;
for (unsigned i = 0; i < kCount; i++) {
    cout << i << ": " << output_z[i].s0 << ", " << output_z[i].s1 << ", " << output_z[i].s2 << endl;
}

// Clean resources
// Events returned by the enqueue calls hold a reference each; release them
// once nothing waits on them any more.
clReleaseEvent(finish_event);
clReleaseEvent(kernel_event);
clReleaseMemObject(input_x_buf);
clReleaseMemObject(input_c_buf);
clReleaseMemObject(output_z_buf);
clReleaseCommandQueue(queue);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseContext(context);
return 0;

按理说这应该能解决 output_z_buf 无法释放的问题，但不知为何并没有解决。我发现内核代码对这个问题有影响。很自然地，我是在内核代码的最后才写入输出缓冲区的；但如果这样做，内核执行结束后缓冲区似乎没有被正确“收尾”，从而在释放缓冲区时导致崩溃。如果在写入输出缓冲区之后再执行一些其他操作（必须是实际的运算，仅加一个屏障无效），问题就不会出现。此时代码看起来像这样：

// The algorithm doing its stuff
// Store this work-item's x/y/z result into element gid0 of the output buffer.
// NOTE(review): gid0 is presumably get_global_id(0). The host allocates only
// kCount elements for the output buffer but launches testDataSize work-items;
// confirm these stores are guarded so gid0 never exceeds the buffer, since an
// out-of-range store would corrupt memory outside it.
output_z_buf[gid0].s0 = x;
output_z_buf[gid0].s1 = y;
output_z_buf[gid0].s2 = z;

// Extra work after the stores; per the author this makes the crash disappear,
// which only hides the symptom rather than explaining it.
for (int i = 0; i < 10; i++) {
    // do some other stuff
}
// Work-group barrier with a local-memory fence.
barrier(CLK_LOCAL_MEM_FENCE);
// End of Kernel code

在 OpenCL 中崩溃并提示"检测到严重错误 c0000374"

0 个答案: