Question

继续我的 OpenCL 探索之旅，下面是我目前从自己的 CUDA 实现移植过来的代码。我想先确认至少第一个内核调用能够正常工作，但我收到了错误 48，不知道自己遗漏了什么。我参照的是 this page 中的示例。

KERNEL

/*
 * Gives every pixel an initial group number.
 *
 * Each work-item handles one pixel: global id 0 is used as the column and
 * global id 1 as the row. The value stored for a pixel is its row-major
 * index divided by c_numColors (integer division).
 *
 * pixelGroup   output buffer, written at index row * c_cols + column
 * c_rows       number of rows accepted by the bounds check
 * c_cols       number of columns; also the row stride of pixelGroup
 * c_numColors  divisor applied to the pixel index
 */
__kernel
void clut_distributePixels(__global int *pixelGroup, int c_rows, int c_cols, int c_numColors){
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items that fall outside the c_cols x c_rows area write nothing.
    if (col < c_cols && row < c_rows) {
        const int pixel = row * c_cols + col;
        pixelGroup[pixel] = pixel / c_numColors;
    }
}

从文件中读取内核

/**
 * Reads a whole file into a freshly malloc'ed, NUL-terminated buffer.
 *
 * @param filename  Path of the file to read.
 * @param length    Out-parameter that receives the number of bytes actually
 *                  read (the terminator is not counted). It is written only
 *                  when the function succeeds.
 * @return The buffer, which the caller must release with free(), or NULL
 *         when the file cannot be opened, measured or buffered. A message is
 *         printed to stderr on every failure path.
 */
char *file_contents(const char *filename, int *length){
    FILE *f = fopen(filename, "r");
    if (!f) {
        fprintf(stderr, "Unable to open %s for reading\n", filename);
        return NULL;
    }

    // Measure the file; every step can fail (e.g. on a non-seekable stream).
    long size = -1;
    if (fseek(f, 0, SEEK_END) == 0) {
        size = ftell(f);
    }
    // The length is reported through an int, so reject sizes that do not
    // survive the narrowing conversion as well as outright failures.
    if (size < 0 || static_cast<int>(size) != size || fseek(f, 0, SEEK_SET) != 0) {
        fprintf(stderr, "Unable to determine the size of %s\n", filename);
        fclose(f);
        return NULL;
    }

    // One extra byte for the terminating NUL.
    char *buffer = static_cast<char*>(malloc(static_cast<size_t>(size) + 1));
    if (!buffer) {
        fprintf(stderr, "Out of memory while reading %s\n", filename);
        fclose(f);
        return NULL;
    }

    // fread may return fewer bytes than requested; terminate after what was
    // really read and report that count.
    const size_t bytesRead = fread(buffer, 1, static_cast<size_t>(size), f);
    fclose(f);
    buffer[bytesRead] = '\0';
    *length = static_cast<int>(bytesRead);

    return buffer;
}

CODE

#include <iostream>
#include <OpenCL/OpenCL.h>

#include "Utilities.hpp"

int main(int argc, const char * argv[]){

    if (argc < 3) {
        std::cout << "Use: {GPU|CPU} nColors" << std::endl;
        return 1;
    }

    /************************************************
            HOST SIDE INITIALIZATION
     ************************************************/
    int h_numColors = atoi(argv[2]);

    Color *h_image;
    int h_rows, h_cols;
    if (readText2RGB("LenaOriginal.txt", &h_image, &h_rows, &h_cols) != SUCCESS){
        return 1;
    }

    int *h_pixelGroup = new int[h_rows*h_cols];
    Color *h_groupRep = new Color[h_numColors];
    Color *h_clutImage = new Color[h_rows*h_cols];
    int h_change = 0;

    /************************************************
                PLATFORM AND DEVICE SETUP
    ************************************************/

    cl_int errorStatus;

    //Use the first platform
    cl_platform_id platform;
    errorStatus = clGetPlatformIDs(1, &platform, NULL);

    //Use the first device that matches the type selected
    cl_device_id device;
    if (strcmp(argv[1], "CPU")){
        errorStatus = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    }else if (strcmp(argv[1], "GPU")){
        errorStatus = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    }else{
        std::cout << "Unknown device type. Choose either CPU or GPU" << std::endl;
        return 1;
    }

    //Define context properties and create context
    cl_context_properties contextProps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
    cl_context context = clCreateContext(contextProps, 1, &device, NULL, NULL, &errorStatus);

    //Create the command queue
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, &errorStatus);

    /************************************************
                DEVICE VARIABLE SETUP
     ************************************************/

    cl_mem d_image;
    cl_mem d_pixelGroup;
    cl_mem d_groupRep;
    cl_mem d_clutImage;
    cl_mem d_change;

    d_image = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(Color)*h_rows*h_cols, h_image, &errorStatus);
    d_pixelGroup = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*h_rows*h_cols, NULL, &errorStatus);
    d_groupRep = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Color)*h_numColors, NULL, &errorStatus);
    d_clutImage = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Color)*h_rows*h_cols, NULL, &errorStatus);
    d_change = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &errorStatus);

    /************************************************
        CREATE, COMPILE PROGRAM and CREATE KERNEL
     ************************************************/

    int pl;
    size_t sourceLength;
    char * sourceCode = file_contents("vectorQuantization.cl", &pl);
    sourceLength = (size_t)pl;

    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&sourceCode, &sourceLength, &errorStatus);

    errorStatus = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    cl_kernel k_clut_distributePixels = clCreateKernel(program, "clut_distributePixels", &errorStatus);
        errorStatus = clSetKernelArg(k_clut_distributePixels, 0, sizeof(cl_mem), (void*)&d_pixelGroup);
        errorStatus = clSetKernelArg(k_clut_distributePixels, 1, sizeof(cl_mem), (void*)&h_rows);
        errorStatus = clSetKernelArg(k_clut_distributePixels, 2, sizeof(cl_mem), (void*)&h_cols);
        errorStatus = clSetKernelArg(k_clut_distributePixels, 3, sizeof(cl_mem), (void*)&h_numColors);

    cl_kernel k_clut_checkDistances = clCreateKernel(program, "clut_checkDistances", &errorStatus);
        errorStatus = clSetKernelArg(k_clut_checkDistances, 0, sizeof(cl_mem), (void*)&d_image);
        errorStatus = clSetKernelArg(k_clut_checkDistances, 1, sizeof(cl_mem), (void*)&d_pixelGroup);
        errorStatus = clSetKernelArg(k_clut_checkDistances, 2, sizeof(cl_mem), (void*)&d_groupRep);
        errorStatus = clSetKernelArg(k_clut_checkDistances, 3, sizeof(cl_mem), (void*)&h_rows);
        errorStatus = clSetKernelArg(k_clut_checkDistances, 4, sizeof(cl_mem), (void*)&h_cols);
        errorStatus = clSetKernelArg(k_clut_checkDistances, 5, sizeof(cl_mem), (void*)&h_numColors);
        errorStatus = clSetKernelArg(k_clut_checkDistances, 6, sizeof(cl_mem), (void*)&d_change);

    cl_kernel k_clut_createImage = clCreateKernel(program, "clut_createImage", &errorStatus);
        errorStatus = clSetKernelArg(k_clut_createImage, 0, sizeof(cl_mem), (void*)&d_clutImage);
        errorStatus = clSetKernelArg(k_clut_createImage, 1, sizeof(cl_mem), (void*)&d_pixelGroup);
        errorStatus = clSetKernelArg(k_clut_createImage, 2, sizeof(cl_mem), (void*)&d_groupRep);
        errorStatus = clSetKernelArg(k_clut_createImage, 3, sizeof(cl_mem), (void*)&h_rows);
        errorStatus = clSetKernelArg(k_clut_createImage, 4, sizeof(cl_mem), (void*)&h_cols);

    /************************************************
            EXECUTE PROGRAM AND GET RESULTS
     ************************************************/

    /*STEP 1: evenly distribute pixels among the colors in the CLUT */
    size_t grid[2] = {static_cast<size_t>(h_rows), static_cast<size_t>(h_cols)};
    errorStatus = clEnqueueNDRangeKernel(queue, k_clut_distributePixels, 2, NULL, grid, NULL, 0, NULL, NULL);
    clFinish(queue);

    /*********/
    /* ERROR */
    /*********/
    errorStatus = clEnqueueReadBuffer(queue, d_pixelGroup, CL_TRUE, 0, sizeof(int)*h_rows*h_cols, h_pixelGroup, 0, NULL, NULL);

    std::cout << h_pixelGroup[7] << ", " << h_pixelGroup[8] << ", " << h_pixelGroup[9] << ", " << h_pixelGroup[10] << std::endl;

    //do {
        /*STEP 2: compute reprenstative */

        /*STEP 3: compute distances and reassign pixel to group */

        //copyFromConstantMemory
    //} while (h_change != 0);

    std::cout << "Done !!" << std::endl;

    return 0;
}

Answer 1

我找到了自己的错误。首先，在学习新东西时一定要检查每个调用的返回值。我想起了当初学习 CUDA 时得到的这条经验，于是用下面这个简单的宏开始检查所有调用：

/*
 * Returns from the enclosing function with the given OpenCL status when it
 * is not CL_SUCCESS; with assertions enabled the failure trips the assert
 * first. The enclosing function must return a type cl_int converts to.
 *
 * The argument is evaluated exactly once, so a call expression such as
 * CL_SUCCESS_OR_RETURN(clFinish(queue)); is safe. The macro expands to a
 * single statement without its own trailing semicolon, so it must be
 * followed by ';' and can be used as the body of an unbraced if/else.
 */
#define CL_SUCCESS_OR_RETURN(code) do { \
    const cl_int cl_status_ = (code); \
    assert(cl_status_ == CL_SUCCESS); \
    if (cl_status_ != CL_SUCCESS) { return cl_status_; } \
} while (0)

错误出在程序最开始判断使用 CPU 还是 GPU 的地方：我忘了 strcmp 在两个字符串相等时返回 0。修正这一点之后，一切都正常了！

无论如何，如果您还有其他意见或建议，或者发现代码中有不够优雅、不符合最佳实践的地方，欢迎留言指出。

启动内核时OpenCL错误48

1 个答案: