Question

我是并行编程的新手。我正在尝试在OpenCL中解决 PrefixSum 问题。但我的输出错了。所以在调试时我改变了我的内核来执行一些简单的操作。我正在使用AMD GPU的Windows8 64位机器。

这是我的内核代码 -

__kernel void add(__global float *input, __global float *output, __global float *temp)
{
    int thid = get_global_id(0);
    int pout = 0;
    int pin = 1;
    temp[pin*8 + thid] = input[thid];
    temp[pout*8 + thid] = input[thid];
    pout = 1-pout;
    pin = 1-pout; 
    int offset = 1;

    if(thid >= offset) { 
        temp[pout*8 + thid] =temp[pout*8 + thid] + temp[pin*8 + thid - offset];
    } else {
       temp[pout*8 + thid] = temp[pin*8 + thid];
    }

    barrier(CLK_GLOBAL_MEM_FENCE);
    output[thid] =  temp[pout*8 + thid];
}

这是我的主机代码 -

    int main(void)
{
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;  
cl_mem inputA,inputB, output;
outfile.open("shubham.txt");
size_t global=8;

float inputDataA[DATA_SIZE]={1, 2, 3, 4, 5, 6, 7, 8};
float results[DATA_SIZE]={0};
float inputDataB[16] = {0};
float shubh[16] = {0};
int i;//,j;

//cl_int infoSize = 10000;
//size_t infoSize;
//char *info;
// retreive a list of platforms avaible
//cl_int p = ;


if(clGetPlatformIDs(1, &platform_id, &num_of_platforms) != CL_SUCCESS)
{
    printf("Unable to get platform id\n");
    return 1;
}


// try to get a supported GPU device
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_of_devices) != CL_SUCCESS)
{
//  printf("shbham");
printf("Unable to get device_id\n");
return 1;
}

// context properties list - must be terminated with 0
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;

// create a context with the GPU device
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);

// create command queue using the context and device
command_queue = clCreateCommandQueue(context, device_id, 0, &err);

// create a program from the kernel source code
program = clCreateProgramWithSource(context,1,(const char **) &ProgramSource, NULL, &err);

// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}

// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);

// create buffers for the input and ouput

inputA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * 16, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);

// load data into the input buffer
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * 16, inputDataB, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);

// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);

// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
clFinish(command_queue);

// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) *DATA_SIZE, results, 0, NULL, NULL);
clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);

// print the results
printf("output: ");

for(i=0;i<DATA_SIZE; i++)
{
printf("%f ",results[i]);
outfile << results[i] << endl;
}
for(i=0;i<16;i++)
{
outfile << shubh[i] <<" ";
}
// cleanup - release OpenCL resources
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}

我正在使用8个工作项运行此代码。输入是[1,2,3,4,5,6,7,8]，预期输出应该是[1,3,5,7,9,11,13,15]，但每次运行我的代码我得到了不同的输出，如[1,3,5,4,5,6,7,15]。似乎有些 thid 没有在＆＃34中更新其temp中的索引;如果＆＃34;条件。

如果问题是因为在添加＆＃34时没有使用atomic_add函数;如果＆＃34;条件然后应该是什么应该是将其更改为原子的语法，我自己尝试但在编译时遇到错误。

或者，如果还有其他问题，请帮我纠正。

PS。我正在使用DEVICE_TYPE_CPU运行我的代码，并且在使用DEVICE_TYPE_GPU时显示错误。我希望这不是问题的原因。

请帮忙

Answer 1

编辑：如果为本地工作组大小指定NULL，则让实现决定。我假设AMD实现选择1作为本地工作组大小，并且您有8个工作组，大小为1.因此，您有8个线程在临时阵列上执行数据争用。它是全局内存，因此它在工作组之间共享。这些障碍在这里没有帮助，因为您无法在OpenCL中的工作组之间进行同步，但您需要这样的同步。这也可以解释为什么你的代码在你指定本地worgroup大小为8时正常工作。然后你有1个工作组，屏障可以同步你的线程。

好的，看看你的内核：

__kernel void add(__global float *input, __global float *output, __global float *temp)
{
    int thid = get_global_id(0);
    int pout = 0;
    int pin = 1;
    temp[pin*8 + thid] = input[thid];
    temp[pout*8 + thid] = input[thid];
    pout = 1-pout;
    pin = 1-pout; 
    int offset = 1;

    if(thid >= offset) { 
        temp[pout*8 + thid] =temp[pout*8 + thid] + temp[pin*8 + thid - offset];
    } else {
       temp[pout*8 + thid] = temp[pin*8 + thid];
    }

    barrier(CLK_GLOBAL_MEM_FENCE);
    output[thid] =  temp[pout*8 + thid];
}

首先，我会删除额外的存储空间，因为它只是将数据复制两次，这是一个性能杀手，也可能是你的问题。（我不知道你运行内核的硬件，以及是否有像Nvidia GPU那样的隐式warp同步）。这里的问题（从并行编程的角度来看）是一个简单的竞争条件。在其他线程使用数据之前，您的线程尚未写入temp。两个解决方案：a）摆脱temp，b）在if语句之前放置一个屏障。但是，在OpenCL中，屏障只能同步同一工作组中的线程，因此如果您使用多个工作组，此内核可能会出现同样的问题。

由于您只是阅读输入和写入输出，因此您不需要temp：

__kernel void add(__global float *input, __global float *output, __global float *temp)
{
    int thid = get_global_id(0);
    int offset = 1;

    if(thid >= offset) { 
       output[thid] = input[thid] + input[thid - offset];
    } else {
       output[thid] = input[thid];
    }
}

这应该这样做。

在OpenCL中并行代码中输出错误

1 个答案: