opencl在不同的nvidia驱动程序上给出不同的结果?

时间:2014-03-04 10:34:46

标签: opencl

这是了解OpenCl功能的示例问题。

问题:我有三个数组 array1、array2、array3。将array2和array3的对应元素相加,结果保存到array1中。

示例:array1 [1] = array2 [1] + array3 [1];

array1,array2和array3是c程序中的全局double数组。这些数组初始化为值1.0;

然后通过OpenCL缓冲区将这些数组传递给GPU。在OpenCL代码中,我使用了10个工作项,每个工作项通过调用所需的函数来处理这些数组中的一个元素,并更新缓冲区。之后再把更新后的缓冲区读回到array1、array2和array3数组中。

将array1的更新值传递给array2和array3,并再次调用内核。再次评估。

因此应该得到结果:

loading kernel..
kernel loaded..
Step 0..
array1[0] = 2.000000
array1[1] = 2.000000
array1[2] = 2.000000
array1[3] = 2.000000
array1[4] = 2.000000
array1[5] = 2.000000
array1[6] = 2.000000
array1[7] = 2.000000
array1[8] = 2.000000
array1[9] = 2.000000
....
....
....
Step 10..
array1[0] = 18.000000
array1[1] = 18.000000
array1[2] = 18.000000
array1[3] = 18.000000
array1[4] = 18.000000
array1[5] = 18.000000
array1[6] = 18.000000
array1[7] = 18.000000
array1[8] = 18.000000
array1[9] = 18.000000
在 nVidia Corporation GeForce GT 630 (rev a1) 显卡(驱动程序版本:331)上,结果正确。

但是,如果我在 nVidia Corporation G96 [GeForce 9500 GT](rev a1)显卡(驱动版本:260.19.26)上运行相同的代码,结果就是错误的:array1的值根本没有变化。

loading kernel..
kernel loaded..
Step 0..
array1[0] = 1.000000
array1[1] = 1.000000
array1[2] = 1.000000
array1[3] = 1.000000
array1[4] = 1.000000
array1[5] = 1.000000
array1[6] = 1.000000
array1[7] = 1.000000
array1[8] = 1.000000
array1[9] = 1.000000
....
....
....
Step 10..
array1[0] = 1.000000
array1[1] = 1.000000
array1[2] = 1.000000
array1[3] = 1.000000
array1[4] = 1.000000
array1[5] = 1.000000
array1[6] = 1.000000
array1[7] = 1.000000
array1[8] = 1.000000
array1[9] = 1.000000

为什么不同的显卡有不同的结果?

running.c代码:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

/* Upper bound, in bytes, on how much of calc.cl is read into memory. */
#define MAX_SOURCE_SIZE (0x100000)

/* Number of elements in each host array. */
#define size 10

/* OpenCL objects shared by create() and eval_eq(). */
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
/* Status of the most recent OpenCL call. */
cl_int ret;
cl_event event;
/* Number of bytes of kernel source read from calc.cl. */
size_t source_size;
/* Device-side copies of array1, array2 and array3. */
cl_mem array1Buffer = NULL, array2Buffer = NULL, array3Buffer = NULL;
/*
 * One work item per array element. The kernel indexes the arrays with
 * get_global_id(0), so a smaller value would leave the trailing elements of
 * array1 uncomputed.
 */
size_t global_work_size[1] = {size};

/* Host-side data: array1 receives array2 + array3, element by element. */
double array1[size];
double array2[size];
double array3[size];


void create () {
    FILE *fp;
    char *source_str;
    fp = fopen("calc.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);


    /*Initialization*/
    /* Get Platform and Device Info */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* Create OpenCL context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    /* Create Command Queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    /*Initialization complete*/

    int i;
    for (i = 0; i< size ; i++) {
        array1[i] = 1.0;
        array2[i] = 1.0;
        array3[i] = 1.0;
    }

    /* Create Kernel Program from the source */
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);

    /* Build Kernel Program */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    /* Create OpenCL Kernel */
    kernel = clCreateKernel(program, "eval", &ret);
}

void eval_eq () {
    ret = clReleaseMemObject(array1Buffer);
    ret = clReleaseMemObject(array2Buffer);
    ret = clReleaseMemObject(array3Buffer);
    array1Buffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, size * sizeof(double),(void *) array1, NULL);
    ret = clEnqueueWriteBuffer(command_queue,
                                  array1Buffer,
                                  CL_FALSE,
                                  0,
                                  size * sizeof(double),
                                  array1,
                                  0,
                                  NULL,
                                  &event);
    ret = clWaitForEvents(1, &event);
    clReleaseEvent(event);
    array2Buffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, size * sizeof(double),(void *) array2, NULL);
    ret = clEnqueueWriteBuffer(command_queue,
                                  array2Buffer,
                                  CL_FALSE,
                                  0,
                                  size * sizeof(double),
                                  array2,
                                  0,
                                  NULL,
                                  &event);
    ret = clWaitForEvents(1, &event);
    clReleaseEvent(event);
    array3Buffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, size * sizeof(double),(void *) array3, NULL);
    ret = clEnqueueWriteBuffer(command_queue,
                                  array3Buffer,
                                  CL_FALSE,
                                  0,
                                  size * sizeof(double),
                                  array3,
                                  0,
                                  NULL,
                                  &event);
    ret = clWaitForEvents(1, &event);
    clReleaseEvent(event);
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&array1Buffer);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&array2Buffer);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&array3Buffer);
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
    ret = clEnqueueReadBuffer(command_queue, array1Buffer, CL_TRUE, 0, size * sizeof(double), array1, 0, NULL, NULL);
    ret = clEnqueueReadBuffer(command_queue, array2Buffer, CL_TRUE, 0, size * sizeof(double), array2, 0, NULL, NULL);
    ret = clEnqueueReadBuffer(command_queue, array3Buffer, CL_TRUE, 0, size * sizeof(double), array3, 0, NULL, NULL);
}


/*
 * Program entry point: prepares the OpenCL state through create(), then runs
 * eval_eq() for steps 0..size inclusive. After each step every element of
 * array1 is printed and array2/array3 are both refilled with the step number.
 */
int main () {
    int step;
    int k;

    printf("loading kernel..\n");
    create();
    printf("kernel loaded..\n");

    step = 0;
    while (step <= size) {
        printf("Step %d..\n", step);
        eval_eq();

        /* Report this step's sums, then seed the inputs for the next step. */
        for (k = 0; k != size; ++k) {
            double next_input = (double) step;

            printf("array1[%d] = %lf\n", k, array1[k]);
            array2[k] = next_input;
            array3[k] = next_input;
        }
        ++step;
    }
    return 0;
}

calc.cl代码:

/*
 * double is an optional type in OpenCL C. OpenCL 1.0/1.1 compilers require
 * the cl_khr_fp64 extension to be enabled before double is used; on a device
 * without that extension the build fails here with a clear diagnostic.
 */
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/* Stores the sum ar2[gpno] + ar3[gpno] into ar1[gpno]. */
void sub_gp (__global double* ar1, __global double* ar2, __global double* ar3, int gpno) {
    ar1[gpno] = ar2[gpno] + ar3[gpno];
}

/* Writes the element-wise sum of ar2 and ar3 at index gpno into ar1. */
void gp (__global double* ar1, __global double* ar2, __global double* ar3, int gpno) {
    /* Same single assignment that sub_gp performs, written in place. */
    ar1[gpno] = ar2[gpno] + ar3[gpno];
}


/*
 * Kernel entry point: each work item handles the array element whose index
 * equals its global id in dimension 0. There is no bounds check here, so the
 * launch size must not exceed the length of the buffers.
 */
__kernel void eval(__global double* ar1, __global double* ar2, __global double* ar3)
{
    const int element = get_global_id(0);

    gp(ar1, ar2, ar3, element);
}

0 个答案:

没有答案