如果把一个cl变量赋值给另一个cl变量,结果中会被填入数量不定的零,并且缺少部分值。但如果在每次调用kernelBlock时都调用一次gcl_memcpy,即使gcl_memcpy拷贝的并不是所需的数组,数据也会突然变得正确。这是什么原因呢?
我希望把对内核的多次顺序调用的结果都保存在GPU的内存中,并在最后只执行一次gcl_memcpy。每次调用kernelBlock时都使用gcl_memcpy会限制性能。
square_kernel.cl
/*
 * Squares one element per work-item: output[gid] = input[gid] * input[gid].
 * The work-item whose global id is 16 additionally stores its squared value
 * into mem_copy at the slot given by *index.
 */
__kernel void square(__global float* input, __global float* output, __global float* mem_copy, __constant int* index)
{
    const size_t gid = get_global_id(0);
    const float value = input[gid];
    const float squared = value * value;

    output[gid] = squared;

    /* Only work-item 16 publishes its result into the accumulation buffer. */
    if (gid != 16)
        return;

    mem_copy[*index] = squared;
}
Main.c
//------------------------------------------------------------------------------
#include <iostream>
#include <OpenCL/opencl.h>
#include "square_kernel.cl.h"
//------------------------------------------------------------------------------
#define NUM_VALUES 32
int main (int argc, const char * argv[])
{
dispatch_queue_t queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL);
if (queue == NULL)
queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL);
//--------------------------------------------------------------------------
int dataSize = sizeof(cl_float) * NUM_VALUES;
cl_float* test_in = new cl_float[sizeof(cl_float) * NUM_VALUES];
for (int i = 0; i < NUM_VALUES; i++)
test_in[i] = (cl_float)i;
//--------------------------------------------------------------------------
float* test_out = new float[dataSize];
__block void* mem_in = gcl_malloc(dataSize, test_in, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR);
__block void* mem_out = gcl_malloc(dataSize, NULL, CL_MEM_READ_WRITE);
__block void* mem_thru = gcl_malloc(sizeof(cl_float), NULL, CL_MEM_READ_WRITE);
__block void* index = gcl_malloc(sizeof(cl_int), NULL, CL_MEM_READ_WRITE);
//--------------------------------------------------------------------------
void (^kernelBlock)() =
^{
cl_ndrange range = // 6
{
1, // The number of dimensions to use.
{0, 0, 0}, // Offset in each dimension. To specify in the test case // 7
{NUM_VALUES,0, 0}, // global range: how many items IN TOTAL in each dimension to process.
{16, 0, 0} // Local size of each workgroup.
};
square_kernel(&range,
(cl_float*)mem_in,
(cl_float*)mem_out,
(cl_float*)mem_thru,
(cl_int*)index); // 8
(*(cl_int*)index)++;
};
for (int i = 0; i < 100; ++i)
dispatch_sync(queue, kernelBlock);
float* test_out = new float[100 * sizeof(cl_float)];
dispatch_sync(queue, ^{
gcl_memcpy(test_out, mem_thru, 100 * sizeof(cl_float));
});
for (int i = 0; i < 100; ++i)
std::cout << test_out[i] << '\n';
//--------------------------------------------------------------------------
gcl_free(mem_in);
gcl_free(mem_out);
gcl_free(mem_thru);
delete[] test_in;
delete[] test_out;
dispatch_release(queue);
//--------------------------------------------------------------------------
return 0;
}
控制台
0
0
0
0
256
256
0
0
256
256
256
0
0
256
256
0
0
0
256
256
256
0
0