Question

我试图通过编写一个计算输入缓冲区中各值平方的内核来学习OpenCL中的clEnqueueMapBuffer，并且只使用clEnqueueMapBuffer通过输出缓冲区每次返回两个元素。据我了解，此函数返回主机内存中的一个指针，该指针指向设备上的缓冲区内存；之后必须调用clEnqueueUnmapMemObject取消映射该缓冲区，内核才能继续计算。现在，当我调用clEnqueueMapBuffer时，它返回的是随机数据。

这是我的内核

// Test kernel: neither data buffer is accessed; the only effect is storing 1
// into the first element of mem_flag.
// NOTE(review): every work-item executes the same store, so with a global
// size greater than 1 all work-items write the same value to the same address.
__kernel void testStream(
    __global int *input_vector,   // not read by this kernel
    __global int *output_vector,  // not written by this kernel
    __global int *mem_flag) // informs the host when the workload is finished
{
    mem_flag[0] = 1;
}

以及我的主机端源代码

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <CL/opencl.h>

#include "utils.h"

int main(void)
{
    unsigned int n = 24;
    int BUFF_SIZE = 2;
    // Input and output vectors
    int num_bytes = sizeof(int) * n;
    int *output_buffer = (int *) malloc(num_bytes);
    int output_buffer_offset = 0;
    int *mapped_data = NULL;
    // use mapped_flag for determining if the job on the device is finished
    int *mapped_flag = NULL;
    int *host_in = (int *) malloc(num_bytes);
    int *host_out = (int *) malloc(num_bytes);
    // Declare cl variables
    cl_mem device_in;
    cl_mem device_out;
    cl_mem device_out_flag;

    // Declare cl boilerplate
    cl_platform_id   platform = NULL;
    cl_device_id     device   = NULL;
    cl_command_queue queue    = NULL;
    cl_context       context  = NULL;
    cl_program       program  = NULL;
    cl_kernel        kernel   = NULL;

    // Located in utils.c -- the source is irrelevant here
    char *kernel_source = read_kernel("kernels/test.cl");

    // Initialize host_in
    int i;
    for (i = 0; i < n; i++) {
        host_in[i] = i + 1;
    }

    // Set up opencl
    cl_int error;
    error   = clGetPlatformIDs(1, &platform, NULL);
    printf("clGetPlatformIDs: %d\n", (int) error);
    error   = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    printf("clGetDeviceIDs: %d\n", (int) error);
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
    printf("clCreateContext: %d\n", (int) error);
    queue   = clCreateCommandQueue(context, device, 0, &error);
    printf("clCreateCommandQueue: %d\n", (int) error);
    program = clCreateProgramWithSource(context, 1,
        (const char**)&kernel_source, NULL, &error);
    printf("clCreateProgramWithSource: %d\n", (int) error);
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    kernel = clCreateKernel(program, "testStream", &error);
    printf("clCreateKernel: %d\n", (int) error);
    // Create the buffers
    device_in = clCreateBuffer(context, CL_MEM_READ_ONLY,
        num_bytes, NULL, NULL);
    device_out = clCreateBuffer(context,
        CL_MEM_WRITE_ONLY,
        sizeof(int) * BUFF_SIZE, NULL, NULL);
    device_out_flag = clCreateBuffer(context,
        CL_MEM_WRITE_ONLY,
        sizeof(int) * 2, NULL, NULL);

    // Write the input buffer
    clEnqueueWriteBuffer(
        queue, device_in, CL_FALSE, 0, num_bytes,
        host_in, 0, NULL, NULL);

    // Set the kernel arguments
    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &device_in);
    error = clSetKernelArg(kernel, 1, sizeof(cl_mem), &device_out);
    error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &device_out_flag);

    // Execute the kernel over the entire range of data
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, 
        (const size_t *) &n, NULL, 0, NULL, NULL);

    // Map and unmap until the flag is set to true
    int break_flag = 0;
    while(1) {
        // Map the buffers
        mapped_data = (int *) clEnqueueMapBuffer(
            queue, device_out, CL_TRUE, CL_MAP_READ, 0,
            sizeof(int) * BUFF_SIZE, 0, NULL, NULL, &error);
        mapped_flag = (int *) clEnqueueMapBuffer(
            queue, device_out_flag, CL_TRUE, CL_MAP_READ, 0,
            sizeof(int) , 0, NULL,NULL, &error);
        // Extract the data out of the buffer
        printf("mapped_flag[0] = %d\n", mapped_flag[0]);
        // Set the break_flag
        break_flag = mapped_flag[0];
        // Unmap the buffers
        error = clEnqueueUnmapMemObject(queue, device_out, mapped_data, 0,
            NULL, NULL);
        error = clEnqueueUnmapMemObject(queue, device_out_flag, mapped_flag,
            0, NULL, NULL);
        if (break_flag == 1) {break;}
        usleep(1000*1000);
    }

   return 0;
}

当我运行程序时，我得到的输出类似于

clGetPlatformIDs: 0
clGetDeviceIDs: 0
clCreateContext: 0
clCreateCommandQueue: 0
clCreateProgramWithSource: 0
clCreateKernel: 0
mapped_flag[0] = 45366144
mapped_flag[0] = 45366144
mapped_flag[0] = 45366144
mapped_flag[0] = 45366144
mapped_flag[0] = 45366144

为什么会这样？

修改

我在一台HP dm1z上运行此代码，系统为64位Fedora 19，内核版本为3.13.7-100.fc19.x86_64。以下是clinfo的输出

Number of platforms:                 1
  Platform Profile:              FULL_PROFILE
  Platform Version:              OpenCL 1.2 AMD-APP (1214.3)
  Platform Name:                 AMD Accelerated Parallel Processing
  Platform Vendor:               Advanced Micro Devices, Inc.
  Platform Extensions:               cl_khr_icd cl_amd_event_callback cl_amd_offline_devices


  Platform Name:                 AMD Accelerated Parallel Processing
Number of devices:               2
  Device Type:                   CL_DEVICE_TYPE_GPU
  Device ID:                     4098
  Board name:                    AMD Radeon HD 6310 Graphics
  Device Topology:               PCI[ B#0, D#1, F#0 ]
  Max compute units:                 2
  Max work items dimensions:             3
    Max work items[0]:               256
    Max work items[1]:               256
    Max work items[2]:               256
  Max work group size:               256
  Preferred vector width char:           16
  Preferred vector width short:          8
  Preferred vector width int:            4
  Preferred vector width long:           2
  Preferred vector width float:          4
  Preferred vector width double:         0
  Native vector width char:          16
  Native vector width short:             8
  Native vector width int:           4
  Native vector width long:          2
  Native vector width float:             4
  Native vector width double:            0
  Max clock frequency:               492Mhz
  Address bits:                  32
  Max memory allocation:             134217728
  Image support:                 Yes
  Max number of images read arguments:       128
  Max number of images write arguments:      8
  Max image 2D width:                16384
  Max image 2D height:               16384
  Max image 3D width:                2048
  Max image 3D height:               2048
  Max image 3D depth:                2048
  Max samplers within kernel:            16
  Max size of kernel argument:           1024
  Alignment (bits) of base address:      2048
  Minimum alignment (bytes) for any datatype:    128
  Single precision floating point capability
    Denorms:                     No
    Quiet NaNs:                  Yes
    Round to nearest even:           Yes
    Round to zero:               Yes
    Round to +ve and infinity:           Yes
    IEEE754-2008 fused multiply-add:         Yes
  Cache type:                    None
  Cache line size:               0
  Cache size:                    0
  Global memory size:                201326592
  Constant buffer size:              65536
  Max number of constant args:           8
  Local memory type:                 Scratchpad
  Local memory size:                 32768
  Kernel Preferred work group size multiple:     32
  Error correction support:          0
  Unified memory for Host and Device:        1
  Profiling timer resolution:            1
  Device endianess:              Little
  Available:                     Yes
  Compiler available:                Yes
  Execution capabilities:                
    Execute OpenCL kernels:          Yes
    Execute native function:             No
  Queue properties:              
    Out-of-Order:                No
    Profiling :                  Yes
  Platform ID:                   0x00007fd434852fc0
  Name:                      Loveland
  Vendor:                    Advanced Micro Devices, Inc.
  Device OpenCL C version:           OpenCL C 1.2 
  Driver version:                1214.3
  Profile:                   FULL_PROFILE
  Version:                   OpenCL 1.2 AMD-APP (1214.3)
  Extensions:                    cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_ext_atomic_counters_32 cl_amd_device_attribute_query cl_amd_vec3 cl_amd_printf cl_amd_media_ops cl_amd_media_ops2 cl_amd_popcnt cl_amd_image2d_from_buffer_read_only

另外，值得注意的是，当我刚开始接触OpenCL时，我运行过一个计算内积的测试程序，但它给出了奇怪的结果。最初我以为这是我的编程错误，就没再理会，但有没有可能是OpenCL实现本身出了问题？如果有帮助的话：OpenGL实现也有多个错误，会导致随机数据块出现在我的桌面背景上，不过这也可能是Linux的问题。

Answer 1

您在调用clEnqueueNDRangeKernel时将NULL作为全局工作大小（global work size）传入：

// Execute the kernel over the entire range of data
// The fifth argument (the global work size pointer) is NULL in this call.
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, NULL, NULL, 0, NULL, NULL);

如果您正在检查此调用返回的错误代码（总是应该），则会收到与CL_INVALID_GLOBAL_WORK_SIZE对应的错误代码。您始终需要指定全局工作大小，因此您的调用应如下所示：

// Execute the kernel over the entire range of data
// The global work size is a size_t array with one entry per work dimension
// (work_dim is 1 here); {1} launches a single work-item.
size_t global[1] = {1};
error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
// check error == CL_SUCCESS!

您对map和unmap缓冲区的调用很好;我已使用上述修复程序对此代码进行了测试，它对我有用。

修复上述问题的更新代码如下所示：

unsigned int n = 24;
...
// Execute the kernel over the entire range of data
// The cast below passes the address of an unsigned int where a pointer to
// size_t is expected.
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, 
    (const size_t *) &n, NULL, 0, NULL, NULL);

这不是将全局工作大小参数传递给内核的安全方法。例如，unsigned int类型的变量n可能只占32位，而size_t可能是64位。这意味着当您传递n的地址并将其转换为const size_t*时，实现会读取一个64位的值，其中包含n的32位以及相邻的另外32位任意数据。您应该先将n赋值给一个size_t变量，再把该变量的地址传给clEnqueueNDRangeKernel，或者直接把n本身改为size_t类型。

这可能与您遇到的问题有关，也可能无关。例如，您可能无意中启动了数量巨大的工作项，这或许可以解释为什么代码在CPU上看起来像是被阻塞了。

Answer 2

以下是我的一些评论：

我觉得你认为调用clEnqueueMapBuffer将会中断内核的执行。我不认为这是正确的。一旦命令启动执行，它就会一直运行直到它完成（或失败......）。可以同时启动多个命令，但是在内核仍处理它时尝试读取某些数据将导致未定义的行为。此外，创建命令队列的方式不允许您同时运行多个命令。创建队列时需要使用CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE属性才能允许。
我不知道在@jprice回答之后您是用多大的全局大小来调用内核的，但除非您只用一个工作项执行内核，否则不应使用这样的语句：mem_flag[0] = 1;因为所有的工作项都会写入同一个位置。（我猜您只贴出了内核的一部分。请检查是否还有其他类似的语句……如果您能贴出完整的内核代码，会很有帮助。）
由于您总是映射和取消映射缓冲区的同一部分，并且总是检查第一个元素（mapped_flag），又由于内核在那个时刻已经完成了计算（参见第一点），您总是读取到相同的值是正常的。

为什么clEnqueueMapBuffer会返回随机数据？

修改

2 个答案: