这是一个用于了解 OpenCL 功能的示例问题。
问题:我有三个数组 array1、array2、array3。将 array2 和 array3 的对应元素相加,并把结果保存到 array1 中。
示例:array1 [1] = array2 [1] + array3 [1];
array1、array2 和 array3 是 C 程序中的全局 double 数组,初始值均为 1.0。
然后通过 OpenCL 缓冲区将这些数组传递给 GPU。在 OpenCL 代码中,我使用了 10 个工作项,每个工作项通过调用所需的函数来处理这些数组中的一个元素,并更新缓冲区。随后把更新后的缓冲区读回到 array1、array2 和 array3 数组中。
将array1的更新值传递给array2和array3,并再次调用内核。再次评估。
因此应该得到结果:
loading kernel..
kernel loaded..
Step 0..
array1[0] = 2.000000
array1[1] = 2.000000
array1[2] = 2.000000
array1[3] = 2.000000
array1[4] = 2.000000
array1[5] = 2.000000
array1[6] = 2.000000
array1[7] = 2.000000
array1[8] = 2.000000
array1[9] = 2.000000
....
....
....
Step 10..
array1[0] = 18.000000
array1[1] = 18.000000
array1[2] = 18.000000
array1[3] = 18.000000
array1[4] = 18.000000
array1[5] = 18.000000
array1[6] = 18.000000
array1[7] = 18.000000
array1[8] = 18.000000
array1[9] = 18.000000
在 nVidia 公司的 GeForce GT 630(rev a1)显卡(驱动程序版本 331)上,结果正确。
但是,如果我在 nVidia Corporation G96 [GeForce 9500 GT](rev a1)显卡(驱动版本 260.19.26)上运行相同的代码,结果就是错误的:array1 的值根本没有变化。
loading kernel..
kernel loaded..
Step 0..
array1[0] = 1.000000
array1[1] = 1.000000
array1[2] = 1.000000
array1[3] = 1.000000
array1[4] = 1.000000
array1[5] = 1.000000
array1[6] = 1.000000
array1[7] = 1.000000
array1[8] = 1.000000
array1[9] = 1.000000
....
....
....
Step 10..
array1[0] = 1.000000
array1[1] = 1.000000
array1[2] = 1.000000
array1[3] = 1.000000
array1[4] = 1.000000
array1[5] = 1.000000
array1[6] = 1.000000
array1[7] = 1.000000
array1[8] = 1.000000
array1[9] = 1.000000
为什么不同的显卡有不同的结果?
running.c代码:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
/* Upper bound on the number of bytes of kernel source read from calc.cl. */
#define MAX_SOURCE_SIZE (0x100000)
/* Number of elements in each of the three host arrays. */
#define size 10

/* OpenCL state used by create() and eval_eq(). */
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
/* Status code of the most recent OpenCL call. */
cl_int ret;
cl_event event;
/* Number of bytes of kernel source actually read from calc.cl. */
size_t source_size;
/* Device buffers bound to kernel arguments 0, 1 and 2 respectively. */
cl_mem array1Buffer, array2Buffer, array3Buffer;
/*
 * One work-item per array element.  The range was previously hard-coded to
 * 5, which left the upper half of array1 untouched by the kernel.
 */
size_t global_work_size[1] = {size};

/* Host-side copies of the data: array1 receives array2 + array3. */
double array1[size];
double array2[size];
double array3[size];
void create () {
FILE *fp;
char *source_str;
fp = fopen("calc.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/*Initialization*/
/* Get Platform and Device Info */
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
/* Create OpenCL context */
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/*Initialization complete*/
int i;
for (i = 0; i< size ; i++) {
array1[i] = 1.0;
array2[i] = 1.0;
array3[i] = 1.0;
}
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);
/* Build Kernel Program */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create OpenCL Kernel */
kernel = clCreateKernel(program, "eval", &ret);
}
void eval_eq () {
ret = clReleaseMemObject(array1Buffer);
ret = clReleaseMemObject(array2Buffer);
ret = clReleaseMemObject(array3Buffer);
array1Buffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, size * sizeof(double),(void *) array1, NULL);
ret = clEnqueueWriteBuffer(command_queue,
array1Buffer,
CL_FALSE,
0,
size * sizeof(double),
array1,
0,
NULL,
&event);
ret = clWaitForEvents(1, &event);
clReleaseEvent(event);
array2Buffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, size * sizeof(double),(void *) array2, NULL);
ret = clEnqueueWriteBuffer(command_queue,
array2Buffer,
CL_FALSE,
0,
size * sizeof(double),
array2,
0,
NULL,
&event);
ret = clWaitForEvents(1, &event);
clReleaseEvent(event);
array3Buffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, size * sizeof(double),(void *) array3, NULL);
ret = clEnqueueWriteBuffer(command_queue,
array3Buffer,
CL_FALSE,
0,
size * sizeof(double),
array3,
0,
NULL,
&event);
ret = clWaitForEvents(1, &event);
clReleaseEvent(event);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&array1Buffer);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&array2Buffer);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&array3Buffer);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, array1Buffer, CL_TRUE, 0, size * sizeof(double), array1, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, array2Buffer, CL_TRUE, 0, size * sizeof(double), array2, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, array3Buffer, CL_TRUE, 0, size * sizeof(double), array3, 0, NULL, NULL);
}
/*
 * Program entry point: set up OpenCL through create(), then run steps
 * 0..size inclusive.  Each step evaluates the kernel, prints array1, and
 * loads the step number into every element of array2 and array3 as input
 * for the following step.
 */
int main () {
    int step;
    int k;

    printf("loading kernel..\n");
    create();
    printf("kernel loaded..\n");

    step = 0;
    while (step <= size) {
        printf("Step %d..\n", step);
        eval_eq();

        /* Report the result of this step. */
        for (k = 0; k < size; k++) {
            printf("array1[%d] = %lf\n", k, array1[k]);
        }

        /* Prepare the operands for the next step. */
        for (k = 0; k < size; k++) {
            array2[k] = (double) step;
            array3[k] = (double) step;
        }

        step++;
    }
    return 0;
}
calc.cl代码:
/*
 * Double precision is an optional extension before OpenCL 1.2 and must be
 * enabled explicitly before the first use of 'double'.  On a device without
 * cl_khr_fp64 this directive is what the compiler reports in the build log,
 * rather than the kernel being left unusable with no stated reason.
 */
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/* Store ar2[gpno] + ar3[gpno] into ar1[gpno]. */
void sub_gp (__global double* ar1, __global double* ar2, __global double* ar3, int gpno) {
    ar1[gpno] = ar2[gpno] + ar3[gpno];
}
/* Write the sum of element gpno of ar2 and ar3 into element gpno of ar1. */
void gp (__global double* ar1, __global double* ar2, __global double* ar3, int gpno) {
    /* Same element-wise sum that sub_gp performs, written out in place. */
    const double sum = ar2[gpno] + ar3[gpno];
    ar1[gpno] = sum;
}
/*
 * Kernel entry point: each work-item passes its global index in dimension 0
 * to gp(), which updates that element of ar1 from ar2 and ar3.
 */
__kernel void eval(__global double* ar1, __global double* ar2, __global double* ar3)
{
    gp(ar1, ar2, ar3, (int)get_global_id(0));
}