Question

我遇到了内核将数据写入错误位置或主机有时不正确地读取数据的问题。我将相同的数据（我写入数据的索引）写入两个不同类型的全局数组。为确保索引是正确的，使用全局计数器，该计数器通过atom_inc递增。从主机上的第二个阵列读取数据时出现问题。例如：

.....
output array index: 442: (output1 value:442.0000     output2 value:442) 
output array index: 443: (output1 value:443.0000     output2 value:443) 
output array index: 444: (output1 value:444.0000     output2 value:444) 
output array index: 445: (output1 value:445.0000     output2 value:445) 
output array index: 446: (output1 value:446.0000     output2 value:1152892928) 
output array index: 447: (output1 value:447.0000     output2 value:447) 
output array index: 448: (output1 value:448.0000     output2 value:1152909312) 
output array index: 449: (output1 value:449.0000     output2 value:1152917504) 
output array index: 450: (output1 value:450.0000     output2 value:1152925696)
......

正如您在标记446,448,449和450+中看到的那样，output2包含错误的值。可能的原因是什么？

设备：ATI Radeon HD5750

代码示例：

#include <stdio.h>
#include <math.h>
#include <OpenCL/OpenCL.h>

// wtf example
const char *programSource =
"__kernel void kernel1(__global uint *counter,\n" \
"__global float *weights,\n" \
"__global uint *weights_pos)\n" \
"{\n"\
"const uint global_size = get_global_size(0);\n" \
"const uint global_id = get_global_id(0);\n" \
"uint local_id = get_local_id(0);\n" \

"if(global_id == 0) {\n" \
"counter[5] = 0; // set index of pos in weights to zero\n" \
"}\n" \

"uint insert_index = atom_inc(&counter[5]);\n" \
"weights[insert_index] = insert_index;\n" \
"weights_pos[insert_index] = insert_index;\n" \
"}";

void art_process_sinogram(const char* tiff_filename,
                          const float *angles2,
                          const unsigned int n_angles2,
                          const unsigned int n_ray2s,
                          const float distanc2e)
{
  /******************************
   * OPENCL ENVIRONMENT
   */
  cl_int status;
  cl_uint numPlatforms = 0;
  cl_platform_id *platforms = NULL;
  cl_device_id device_id;

  //discover platforms
  status = clGetPlatformIDs(0, NULL, &numPlatforms);
  platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
  status = clGetPlatformIDs(numPlatforms, platforms, NULL);

  //discover devices
  cl_uint numDevices = 0;
  cl_device_id *devices = NULL;

  status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
  devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
  status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
  device_id = devices[1];
  //create context
  cl_context context = NULL;
  context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

  cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status);
  clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  cl_kernel  kernel_weights = clCreateKernel(program, "kernel1", &status);

  //create queue
  cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status);

  /******************************
   * HARDWARE PARAMETERS
   */
  cl_uint wavefronts_per_SIMD = 7;
  size_t global_work_size;
  size_t local_work_size = 64;

  cl_uint max_compute_units;

  clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);

  size_t wg_count = max_compute_units * wavefronts_per_SIMD;
  global_work_size = wg_count * local_work_size;

  /**************************** DATA PART *************************************/

  size_t w_portion_size = 768 * sizeof(cl_float);
  size_t w_pos_portion_size = 768 * sizeof(cl_uint);

  size_t counters_data_size = 6 * sizeof(cl_uint);
  cl_uint counters_data[6];
  counters_data[0] = 1;
  counters_data[1] = 2; // max number of the cells intersected by the ray
  counters_data[2] = 3;
  counters_data[3] = 4;
  counters_data[4] = 5; // same to the number of rays
  counters_data[5] = 0; // counter inside kernel

  /*****************
   * Main buffers
   */
  cl_mem weights1_buffer = clCreateBuffer(context,
                                          CL_MEM_READ_WRITE,
                                          w_portion_size,
                                          NULL,
                                          NULL);

  cl_mem weights_pos1_buffer = clCreateBuffer(context,
                                              CL_MEM_READ_WRITE,
                                              w_pos_portion_size,
                                              NULL,
                                              NULL);
  /*****************
   * Supplement buffers (constant)
   */
  cl_mem counters_data_buffer = clCreateBuffer(context,
                                               CL_MEM_READ_ONLY,
                                               counters_data_size,
                                               NULL,
                                               &status);


  cl_event supplement_buffer_ready[1];

  status = clEnqueueWriteBuffer(command_queue1,
                                counters_data_buffer,
                                CL_FALSE,
                                0,
                                counters_data_size,
                                counters_data,
                                0,
                                NULL,
                                &supplement_buffer_ready[0]);

  status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer);
  status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer);
  status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer);

  status = clEnqueueNDRangeKernel(command_queue1,
                                  kernel_weights,
                                  1, // work dimensional 1D, 2D, 3D
                                  NULL, // offset
                                  &global_work_size, // total number of WI
                                  &local_work_size, // nomber of WI in WG
                                  1, // num events in wait list
                                  supplement_buffer_ready,  // event wait list
                                  NULL); // event

  clFinish(command_queue1);
  cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1,
                                                      weights1_buffer,//*pmain_weights_buffer,
                                                      CL_TRUE,
                                                      CL_MAP_READ,
                                                      0,
                                                      w_portion_size,
                                                      0, NULL, NULL, NULL);
  cl_uint *output2 = malloc(w_portion_size);
  status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer,
                               CL_TRUE, 0, w_pos_portion_size, output2,
                               0, NULL, NULL);

  clFinish(command_queue1);
  for(int i = 0; i < 790; ++i) {
    printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]);
  }
}

SOLUTION：

内核应该看起来像（需要检查索引）：

  __kernel void k_1(__global uint *counter, 
                    __global uint *weights, 
                    __global uint2 *weights_pos)
 {
    const uint global_size = get_global_size(0);
    const uint global_id = get_global_id(0);
    uint local_id = get_local_id(0);

    uint insert_index = atom_inc(&counter[5]);
    if(insert_index < 768) {
       weights[insert_index]= insert_index;
       weights_pos[insert_index].x = insert_index;
       weights_pos[insert_index].y = insert_index;
    }
}

Answer 1

你搞乱了缓冲区尺寸。

1）你的缓冲区各包含768个元素（参见w_portion_size和w_pos_portion_size的初始化）

2）我的机器上的工作组大小为896（参见wg_count的初始化）

3）您打印出790个值。

除此之外，还有一个概念错误：

if(global_id == 0) {
     counter[5] = 0; // set index of pos in weights to zero
}
//atomic increments on counter[5]

您不能假设第一个虚拟处理器将在其他虚拟处理器之前执行此行。您应该完全删除此行，因为您在主机端初始化counter[5]。（我相信这是你问题的原因，但我无法重现这一点。）

修复这些问题后，您的代码似乎运行正常（英特尔实施）。

Answer 2

内核应该看起来像（需要检查索引）：

__kernel void k_1(__global uint *counter, 
                __global uint *weights, 
                __global uint2 *weights_pos)
{
   const uint global_size = get_global_size(0);
   const uint global_id = get_global_id(0);
   uint local_id = get_local_id(0);

   uint insert_index = atom_inc(&counter[5]);

   if(insert_index < 768) {
     weights[insert_index]= insert_index;
     weights_pos[insert_index].x = insert_index;
    weights_pos[insert_index].y = insert_index;
   }
}

在OpenCL上读取错误的数据

2 个答案: