OpenCL:在内核内部进行变量赋值后,返回的结果中被填充了零值

时间:2019-08-22 14:32:00

标签: c++ c macos opencl

问题

如果在内核中把一个cl变量的值赋给另一个cl变量,结果中会被填入数量不定的零,并且缺少部分值。如果在每次调用kernelBlock时都调用一次gcl_memcpy,那么即使gcl_memcpy拷贝的并不是所需的数组,数据也会突然变得正确。这是什么原因呢?

背景

我希望把多次顺序调用内核得到的结果保存在GPU的内存中,只在最后执行一次gcl_memcpy。如果每次调用kernelBlock时都执行gcl_memcpy,性能会受到限制。

示例代码

square_kernel.cl

/*
 * Squares one element per work-item.
 *
 * input    - values to square, indexed by global id.
 * output   - receives input[gid] * input[gid] at the same index.
 * mem_copy - the work-item whose global id is 16 additionally stores its
 *            squared value here, at the slot given by *index.
 * index    - slot in mem_copy written by that work-item.
 */
__kernel void square(__global float* input, __global float* output, __global float* mem_copy, __constant int* index)
{
    const size_t gid = get_global_id(0);
    const float squared = input[gid] * input[gid];

    output[gid] = squared;

    /* Only a single work-item forwards its result to the pass-through buffer. */
    if (gid != 16)
    {
        return;
    }
    mem_copy[*index] = squared;
}

Main.c

//------------------------------------------------------------------------------
#include <iostream>
#include <OpenCL/opencl.h>
#include "square_kernel.cl.h"
//------------------------------------------------------------------------------
#define NUM_VALUES 32
int main (int argc, const char * argv[])
{
    dispatch_queue_t queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL);
    if (queue == NULL)
        queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL);
    //--------------------------------------------------------------------------
    int dataSize = sizeof(cl_float) * NUM_VALUES;
    cl_float* test_in = new cl_float[sizeof(cl_float) * NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++)
        test_in[i] = (cl_float)i;

    //--------------------------------------------------------------------------
    float* test_out = new float[dataSize];
    __block void* mem_in  = gcl_malloc(dataSize, test_in, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
    __block void* mem_out = gcl_malloc(dataSize, NULL, CL_MEM_READ_WRITE);
    __block void* mem_thru  = gcl_malloc(sizeof(cl_float), NULL, CL_MEM_READ_WRITE);
    __block void* index  = gcl_malloc(sizeof(cl_int), NULL, CL_MEM_READ_WRITE);
    //--------------------------------------------------------------------------
    void (^kernelBlock)() =
        ^{
        cl_ndrange range =  // 6
        {
            1,                  // The number of dimensions to use.
            {0, 0, 0},          // Offset in each dimension.  To specify in the test case // 7
            {NUM_VALUES,0, 0}, // global range: how many items IN TOTAL in each dimension to process.
            {16, 0, 0}         // Local size of each workgroup.
        };
        square_kernel(&range,
                      (cl_float*)mem_in,
                      (cl_float*)mem_out,
                      (cl_float*)mem_thru,
                      (cl_int*)index); // 8
        (*(cl_int*)index)++;
    };

    for (int i = 0; i < 100; ++i)
        dispatch_sync(queue, kernelBlock);

    float* test_out = new float[100 * sizeof(cl_float)];
    dispatch_sync(queue, ^{
        gcl_memcpy(test_out, mem_thru, 100 * sizeof(cl_float));
    });

    for (int i = 0; i < 100; ++i)
        std::cout << test_out[i] << '\n';
    //--------------------------------------------------------------------------
    gcl_free(mem_in);
    gcl_free(mem_out);
    gcl_free(mem_thru);
    delete[] test_in;
    delete[] test_out;
    dispatch_release(queue);
    //--------------------------------------------------------------------------
    return 0;
}

控制台

0
0
0
0
256
256
0
0
256
256
256
0
0
256
256
0
0
0
256
256
256
0
0

0 个答案:

没有答案