我正在尝试学习OpenCL,目前我正在练习制作不同的内核。在我尝试使用缩减方法制作点积时,我遇到了一个我不明白的问题。当我使用int输入和输出运行我的代码时,它工作正常。当我将所有int类型更改为float类型(对于输入和输出)时,它给出了一个接近但稍微偏离的结果。任何人都可以弄清楚这是什么或导致它的原因吗?
这是主持人代码
#include <stdio.h>
#include <CL/cl.h>
#include <string.h>
#include <stdlib.h>
/*
* IF ERROR, PRINT OUT OPENCL ERROR CODE
*/
void errorCheck(cl_int error){
if(error < CL_SUCCESS){
printf("Failed - OpenCL error #: %d\n", error);
//exit(1);
}
}
int main(){
cl_int err;
cl_uint numPlatforms = 0;
cl_uint numDevices = 0;
cl_platform_id * platformIDs = NULL;
cl_device_id * deviceIDs = NULL;
cl_context context = NULL;
cl_command_queue cmdQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
FILE *program_handle;
char *program_buffer;
size_t program_size;
cl_mem a_buff = NULL;
cl_mem b_buff = NULL;
cl_mem c_buff = NULL;
int * a = NULL;
int * b = NULL;
int * c = NULL;
int correct_dot = 0;
int ELEMENTS = 1024;
const int datasize = sizeof(int)*ELEMENTS;
a = (int *)malloc(datasize);
b = (int *)malloc(datasize);
c = (int *)malloc(datasize);
int i;
for(i = 0; i<ELEMENTS; i++){
a[i] = i;
b[i] = i;
correct_dot += (int)i*i;
}
/*
**RETRIEVE NUMBER OF PLATFORMS
*/
err = clGetPlatformIDs(0, NULL, &numPlatforms);
errorCheck(err);
/*
**ALLOCATE SPACE FOR/RETRIEVE IDS OF PLATFORMS
*/
platformIDs = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
err = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
errorCheck(err);
/*
**RETRIEVE NUMBER OF DEVICES
*/
err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
errorCheck(err);
/*
**ALLOCATE SPACE FOR/RETRIEVE IDS OF DEVICES
*/
deviceIDs = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, deviceIDs, NULL);
errorCheck(err);
/*
**GET INFO ON DEVICE
*/
cl_uint * info;
size_t info_size;
err = clGetDeviceInfo(deviceIDs[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, 0, NULL, &info_size);
info = (cl_uint *) malloc(info_size);
err = clGetDeviceInfo(deviceIDs[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, info_size, info, NULL);
errorCheck(err);
/*
**CREATE CONTEXT
*/
context = clCreateContext(NULL, numDevices, deviceIDs, NULL, NULL, &err);
errorCheck(err);
/*
**CREATE COMMAND QUEUE
*/
cmdQueue = clCreateCommandQueue(context, deviceIDs[0], 0, &err);
errorCheck(err);
/*
**CREATE BUFFERS
*/
a_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, datasize, a, &err);
errorCheck(err);
b_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, datasize, b, &err);
errorCheck(err);
c_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, datasize, c, &err);
errorCheck(err);
err = clEnqueueWriteBuffer(cmdQueue, a_buff, CL_TRUE, 0, datasize, a, 0, NULL, NULL);
errorCheck(err);
err = clEnqueueWriteBuffer(cmdQueue, b_buff, CL_TRUE, 0, datasize, b, 0, NULL, NULL);
errorCheck(err);
/*
**READ PROGRAM FROM FILE
*/
program_handle = fopen("kernels/mvdot.cl", "r");
if(program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/*
**CREATE PROGRAM
*/
program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, &err);
errorCheck(err);
/*
**BUILD PROGRAM
*/
err = clBuildProgram(program, 1, deviceIDs, NULL, NULL, NULL);
errorCheck(err);
if (err == CL_BUILD_PROGRAM_FAILURE) {
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
}
/*
**CREATE KERNEL
*/
kernel = clCreateKernel(program, "mvdot", &err);
errorCheck(err);
/*
**SET UP KERNEL ARGS
*/
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a_buff); //input A
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b_buff); //input B
err |= clSetKernelArg(kernel, 2, datasize, NULL); //local scratch
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &c_buff); //output C
err |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void *) &ELEMENTS); //size
errorCheck(err);
/*
**ENQUEUE KERNEL FOR EXECUTION
*/
const size_t global_work_size = ELEMENTS;
const size_t local_work_size = info[0];
err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
errorCheck(err);
/*
**READ OUTPUT
*/
err = clEnqueueReadBuffer(cmdQueue, c_buff, CL_TRUE, 0, datasize, c, 0, NULL, NULL);
errorCheck(err);
/*
**CHECK OUTPUT
*/
int our_dot = 0;
printf("C = [");
//elements/info[0] gives # of work groups
for(i = 0; i<ELEMENTS; i++){
printf(" %d ", c[i]);
our_dot += c[i];
}
printf("]\n");
printf("Our dot = %d\n", our_dot);
printf("Correct dot = %d\n", correct_dot);
if(our_dot == correct_dot){
printf("PASSED\n");
} else {
printf("FAILED\n");
}
/*
**FREE RESOURCES
*/
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseContext(context);
clReleaseMemObject(a_buff);
clReleaseMemObject(b_buff);
clReleaseMemObject(c_buff);
free(platformIDs);
free(deviceIDs);
free(program_buffer);
free(a);
free(b);
free(c);
}
这是内核代码。
__kernel void mvdot(
__global int* a,
__global int* b,
__local int* scratch,
__global int* c,
__const int size)
{
int g_id = get_global_id(0);
int l_id = get_local_id(0);
int local_size = get_local_size(0);
int b_id = get_group_id(0);
//move the product of the two inputs to scratch
//if our thread is outside of the range of the inputs
//pad the scratch with 0
scratch[l_id] = (g_id < size) ? (a[g_id] * b[g_id]) : 0;
//Use local memory to compute the sum of the products
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = 2; i<=local_size; i*=2) {
int offset = local_size/i;
scratch[l_id] = (l_id < offset) ? (scratch[l_id] + scratch[l_id + offset]) : scratch[l_id];
}
barrier(CLK_LOCAL_MEM_FENCE);
//Move the final sum for each group to the output
if(l_id == 0){
c[b_id] = scratch[0];
}
}
我确定这是非常愚蠢的事情,但我只是不理解它。
答案 0 :(得分:1)
在本地内存减少中存在竞争条件,因为不同的工作项在连续循环迭代中读取和写入相同的位置。在每次迭代时都需要一个屏障,并且还应避免在不执行任何工作时重写值(因为这会与尝试读取该位置的其他工作项冲突)。
编写这种工作组缩减的典型方法看起来像这样:
//Use local memory to compute the sum of the products
for (int offset = local_size/2; offset > 0; offset/=2) {
barrier(CLK_LOCAL_MEM_FENCE);
if (l_id < offset)
scratch[l_id] = scratch[l_id] + scratch[l_id + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
如果您知道正在执行的硬件的SIMD宽度,则可以展开循环的最后几次迭代并在没有障碍的情况下执行它们(如果工作项在锁定步骤中执行,则不再需要同步)。
正如@DarkZeros在评论中指出的那样,罪魁祸首就是浮点的精确度(虽然上面的竞争条件仍应处理)。您可以通过将数据类型更改为double
来验证这一点 - 这样做会导致代码在我的GPU上通过测试。