我有一个 OpenCL (1.2) 内核,它接受一个 `constant` 参数,而这个参数是一个子缓冲区。当我运行这个内核时,它似乎使用的是父缓冲区。如果我改用 `global const` 参数,它就会按预期工作。
我本想把它归结为驱动程序错误,但我在不同机器上的英特尔(Linux,beignet git)和 nVidia(Linux,367.44-3)实现上都能重现它,这让我觉得是我自己哪里做错了。
以下是一个可运行的示例。预期输出为 `1, 1025, 1, 1025, `,但实际打印的是 `1, 1, 1, 1025, `。
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <CL/cl.h>
/*
 * Number of elements in the array `x`.
 * Only meaningful for true arrays; a pointer argument yields a wrong count.
 * The argument is parenthesised so that any array-valued expression is safe.
 */
#define NELEMS(x) (sizeof(x) / sizeof((x)[0]))
/* Zero-based indices of the OpenCL platform and device the program runs on. */
enum {
    PLATFORM = 0,
    DEVICE = 0
};
/*
 * OpenCL C source for the two kernels under test. They are identical except
 * for the address space of their argument: test1 takes a `constant` pointer,
 * test2 a `global const` pointer. In each kernel, only the work-item whose
 * global id is 1 prints, and it prints a[1].
 */
const char src[] =
"kernel void test1(constant int * const a) {\n"
" size_t i = get_global_id(0);\n"
" if (i == 1)\n"
" printf(\"%i, \", a[i]);\n"
"}\n"
"\n"
"kernel void test2(global const int * const a) {\n"
" size_t i = get_global_id(0);\n"
" if (i == 1)\n"
" printf(\"%i, \", a[i]);\n"
"}\n";
// Size of src in bytes; sizeof on the array also counts the terminating NUL.
const size_t src_len = sizeof(src);
// Names of the kernels defined in src; main() runs them in this order.
const char * const kernels[] = {"test1", "test2"};
/*
 * Reproducer for `constant` kernel arguments that are backed by sub-buffers.
 *
 * Creates an 8192-byte buffer filled with the ints 0, 1, 2, ..., splits it
 * into two equal sub-buffers, and runs every kernel listed in `kernels` once
 * per sub-buffer. Each kernel prints element 1 of its argument, so the second
 * sub-buffer (which starts at element 1024 of the parent) should print 1025
 * and the full expected output is "1, 1025, 1, 1025, ".
 *
 * Every fallible call is checked with assert(). Returns 0 on success.
 */
int main(void) {
    cl_int err = -1;

    /* Select the platform and device given by PLATFORM / DEVICE. */
    cl_uint num_platforms = 0;
    err = clGetPlatformIDs(0, NULL, &num_platforms);
    assert(err == CL_SUCCESS);
    assert(num_platforms > PLATFORM);
    cl_platform_id *platforms = malloc(sizeof(*platforms) * num_platforms);
    assert(platforms != NULL);
    err = clGetPlatformIDs(num_platforms, platforms, NULL);
    assert(err == CL_SUCCESS);

    cl_uint num_devices = 0;
    err = clGetDeviceIDs(platforms[PLATFORM], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
    assert(err == CL_SUCCESS);
    /* Strictly greater: DEVICE is used as an index into devices[]. */
    assert(num_devices > DEVICE);
    cl_device_id *devices = malloc(sizeof(*devices) * num_devices);
    assert(devices != NULL);
    err = clGetDeviceIDs(platforms[PLATFORM], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
    assert(err == CL_SUCCESS);

    cl_context_properties context_properties[] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[PLATFORM], 0
    };
    cl_context context = clCreateContext(context_properties, 1, &devices[DEVICE], NULL, NULL, &err);
    assert(err == CL_SUCCESS);

    /* clCreateCommandQueue is deprecated in OpenCL 2.0 headers but is the 1.2 API. */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
    cl_command_queue queue = clCreateCommandQueue(context, devices[DEVICE], 0, &err);
#pragma GCC diagnostic pop
    assert(err == CL_SUCCESS);

    /*
     * clCreateProgramWithSource expects an array of string pointers. `&src`
     * would be a pointer to the char array itself, not a `const char **`,
     * so go through a real pointer variable instead of copying to the heap.
     * A NULL `lengths` argument tells OpenCL the string is NUL-terminated.
     */
    const char *source = src;
    cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &err);
    assert(err == CL_SUCCESS);
    err = clBuildProgram(program, 1, &devices[DEVICE], "", NULL, NULL);
    assert(err == CL_SUCCESS);

    size_t buffer_size = 8192;
    size_t subbuffer_size = buffer_size / 2;
    {
        /*
         * Sanity-check that the sub-buffer origin is acceptably aligned and
         * that a sub-buffer fits in constant memory.
         * NOTE(review): CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits, so
         * requiring divisibility by the raw value is conservative.
         */
        cl_uint align;
        err = clGetDeviceInfo(devices[DEVICE], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align), &align, NULL);
        assert(err == CL_SUCCESS);
        assert(subbuffer_size % align == 0);

        cl_ulong constbuf_size;
        err = clGetDeviceInfo(devices[DEVICE], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(constbuf_size), &constbuf_size, NULL);
        assert(err == CL_SUCCESS);
        assert(constbuf_size > subbuffer_size);
    }

    cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, &err);
    assert(err == CL_SUCCESS);

    /* Carve the parent buffer into consecutive, equally sized regions. */
    cl_mem sub_buffers[2];
    for (size_t i = 0; i < NELEMS(sub_buffers); i++) {
        cl_buffer_region region = {
            .origin = i * subbuffer_size,
            .size = subbuffer_size,
        };
        sub_buffers[i] = clCreateSubBuffer(buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        assert(err == CL_SUCCESS);
    }

    {
        /* Fill the parent buffer with data[i] == i through a blocking map. */
        cl_int *data = clEnqueueMapBuffer(queue, buffer, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, 0, NULL, NULL, &err);
        assert(err == CL_SUCCESS);
        for (size_t i = 0; i < buffer_size / sizeof(cl_int); i++) {
            data[i] = (cl_int) i;
        }

        cl_event unmap_event;
        err = clEnqueueUnmapMemObject(queue, buffer, data, 0, NULL, &unmap_event);
        assert(err == CL_SUCCESS);
        err = clWaitForEvents(1, &unmap_event);
        assert(err == CL_SUCCESS);
        /* Events returned by enqueue calls are owned by the caller. */
        clReleaseEvent(unmap_event);
    }

    /* Run every kernel against every sub-buffer, one launch at a time. */
    for (size_t k = 0; k < NELEMS(kernels); k++) {
        cl_kernel kernel = clCreateKernel(program, kernels[k], &err);
        assert(err == CL_SUCCESS);

        for (size_t i = 0; i < NELEMS(sub_buffers); i++) {
            err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &sub_buffers[i]);
            assert(err == CL_SUCCESS);

            /* One work-item per cl_int in the sub-buffer. */
            size_t work_size[] = {subbuffer_size / sizeof(cl_int)};
            cl_event run_event;
            err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, work_size, NULL, 0, NULL, &run_event);
            assert(err == CL_SUCCESS);
            err = clWaitForEvents(1, &run_event);
            assert(err == CL_SUCCESS);
            clReleaseEvent(run_event);

            err = clFinish(queue);
            assert(err == CL_SUCCESS);
        }
        clReleaseKernel(kernel);
    }
    puts("");

    for (size_t i = 0; i < NELEMS(sub_buffers); i++) {
        clReleaseMemObject(sub_buffers[i]);
    }
    clReleaseMemObject(buffer);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    free(devices);
    free(platforms);
    return 0;
}
答案 0 :(得分:0)
这很有趣。我在不同的设备上进行了尝试:MacBook Pro 中有 3 个设备(包括 Nvidia、Iris 和 Intel),它们都得到了正确的输出。而在同一台 MBP 上的 Windows 10 中使用 Nvidia 驱动程序时,输出与问题中的完全相同。我认为这是一个 Nvidia 的错误,但并不仅限于 Nvidia。