OpenCL程序中的未知错误

时间:2016-07-15 06:19:54

标签: c opencl gpu gpgpu

我是opencl编程的新手,并尝试在C中创建以下“hello world”类型的OpenCL程序。

内核代码(hello.cl)

__kernel void hello(__global int* input,  __global int* output, const unsigned int count)
{

int i = get_global_id(0);
if(i < count)
    output[i] = input[i] * input[i]; 
}

主机代码:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DATA_SIZE (10)

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MAX_SOURCE_SIZE (0x100000)

int main()
{
    size_t              count           = DATA_SIZE;
    cl_platform_id      platform_id     = NULL;
    cl_device_id        device_id       = NULL;
    cl_context          context         = NULL;
    cl_command_queue    command_queue   = NULL;
    cl_mem              memobj          = NULL;
    cl_program          program         = NULL;
    cl_kernel           kernel          = NULL;

    cl_uint             ret_num_devices;
    cl_uint             ret_num_platforms;
    cl_int              ret;

    size_t global;                       // local domain size for our calculation
    size_t local;                       // local domain size for our calculation

    FILE                *fp;
    char                fileName[] = "./hello.cl";
    char                *source_str;
    size_t              source_size;

    /* Load the source code containing the kernel*/
    fp = fopen(fileName, "r");
    if (!fp) 
    {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }

    source_str  = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    /* Get Platform and Device Info */
    ret             = clGetPlatformIDs      (1, &platform_id, &ret_num_platforms);

    ret             = clGetDeviceIDs        (platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
    if (ret != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }

    /* Create OpenCL context */
    context         = clCreateContext       (NULL, 1, &device_id, NULL, NULL, &ret);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }

    /* Create Command Queue */
    command_queue   = clCreateCommandQueue  (context, device_id, 0, &ret);
    if (!command_queue  )
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }


    /* Create Kernel Program from the source */
    program         = clCreateProgramWithSource (context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    } 

    /* Build Kernel Program */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    } 

    /* Create OpenCL Kernel */
    kernel          = clCreateKernel(program, "hello", &ret);
    if (!kernel || ret != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }     

    int data[DATA_SIZE];              // original data set given to device
    int results[DATA_SIZE];           // results returned from device
    int i = 0;
    for(i = 0; i < count; i++)
        data[i] = i+1;


    cl_mem input;                       // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(int) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }    

    ret = clEnqueueWriteBuffer(command_queue, input, CL_TRUE, 0, sizeof(int) * count, data, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }

    /* Set OpenCL Kernel Parameters */
    ret = 0;
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    ret |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (ret != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", ret);
        exit(1);
    }

    /* Execute OpenCL Kernel */
    ret = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (ret != CL_SUCCESS)
    {   
        printf("Error:  ! %d\n", ret);
        scanf("%d",&global);
        exit(1);
    }

    global = (size_t) count;
    ret             = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);

    clFinish(command_queue);

    /* Copy results from the memory buffer */
    ret = clEnqueueReadBuffer( command_queue, output, CL_TRUE, 0, sizeof(int) * count, results, 0, NULL, NULL );  
    if (ret != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", ret);
        exit(1);
    }

    // Validate our results
    //
    int correct = 0;
    for(i = 0; i < count; i++)
    {
        printf("%d-%d\n",data[i],results[i]);
        if(results[i] == data[i] * data[i])
            correct++;
    }

    // Print a brief summary detailing the results
    //
    printf("Computed '%d/%d' correct values!\n", correct, count);

    scanf("%d",&ret);
    /* Finalization */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    clReleaseMemObject(input);
    clReleaseMemObject(output);

    free(source_str);

    return 0;
}

这似乎是非常简单的代码,但是,结果数组包含所有零值。谁能告诉我我在这里犯的错误。该计划的输出如下:

1-0
2-0
3-0
4-0
5-0
6-0
7-0
8-0
9-0
10-0
Computed '0/10' correct values!

我无法找到任何因为gpu没有计算出正确值的原因。

1 个答案:

答案 0 :(得分:1)

您没有检查validate_on_submit的返回值,这几乎肯定是失败的,因为您无法确保全局尺寸是本地尺寸的整数倍。

查询clEnqueueNDRangeKernel的结果可能类似于256,但您的全局作业大小为10.您无法将包含10个项目的作业细分为256个组。