这是某个并行归约(求极值)内核的一部分。我已将它精简为仍会使 clBuildProgram 崩溃的最小代码(注意:它是真的崩溃,而不只是返回错误码):
编辑:当 local_value 声明为 global 而不是 local 时,似乎也会发生这种情况。
编辑 2 / 解决方案:问题在于存在无限循环。我应该写 remaining_items >>= 1,而不是 remaining_items >> 1。正如答案中所说,NVIDIA 编译器在遇到编译/优化方面的错误时似乎不够健壮。
// Minimal reproduction kernel, kept exactly as originally posted (including its bug),
// because this exact code is what makes clBuildProgram crash on the NVIDIA toolchain.
// Intent: a work-group reduction that keeps the larger of two values at each step.
// local_value: pointer to the float buffer (in local memory) that is reduced in place.
kernel void testkernel(local float *local_value)
{
size_t thread_id = get_local_id(0);
int remaining_items = 1024;
while (remaining_items > 1)
{
// intended to throw away the right half of the threads on every iteration
remaining_items >> 1; // <-- BUG: the shift result is discarded, so remaining_items stays 1024 and the loop condition never becomes false; should be `remaining_items >>= 1`
if (thread_id > remaining_items)
{
// NOTE: returning here means this work-item never reaches the barrier() at the end of the loop body
return;
}
// look for a greater value in the right half of the memory space
int right_index = thread_id + remaining_items;
float right_value = local_value[right_index];
if (right_value > local_value[thread_id])
{
local_value[thread_id] = right_value;
}
// NOTE(review): the fence flag is CLK_GLOBAL_MEM_FENCE although local_value is declared `local` -- confirm this is intended
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
删除 return; 这一行和/或 local_value[thread_id] = right_value; 这一行,clBuildProgram 就能成功完成。
我可以在我所有的计算机上重现这个问题(NVIDIA GTX 560、GT 555M、GT 540M,它们都是 Fermi 2.1 架构)。使用 NVIDIA CUDA Toolkit SDK 4.0、4.1 和 4.2 版本时都会出现该问题,无论使用的是 x64 还是 x86 库。
有没有人知道可能是什么问题?
有没有可能本地(也称共享)内存的大小被自动假定为 (WORK_GROUP_SIZE) * sizeof(its_base_type)?这可以解释为什么删除上面提到的那些行之后它就能正常工作。
用于复现问题的最小主机代码(兼容 C99):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
// RETURN_THROW(expression): evaluates `expression` once, stores its value in a cl_int,
// and if that value is non-zero prints "<expression text> FAILED: <code>" and calls exit(1).
// Use for OpenCL calls that report their status through the return value.
// NOTE: the macro declares its own local `ret`, so `expression` must not refer to an outer variable named `ret`.
#define RETURN_THROW(expression) do { cl_int ret = expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
// REF_THROW(expression): declares a local cl_int `ret`, runs `expression` as a statement,
// then prints "<expression text> FAILED: <code>" and calls exit(1) if `ret` is non-zero.
// `expression` is expected to write the status into `ret` itself (e.g. by passing `&ret`
// as an error-code out-parameter); otherwise `ret` is read uninitialized.
#define REF_THROW(expression) do { cl_int ret; expression; if (ret) { printf(#expression " FAILED: %d\n" , ret); exit(1); } } while (0)
int main(int argc, char **argv)
{
// Load the kernel source code into the array source_str
FILE *fp;
fp = fopen("testkernel.cl", "rb");
if (!fp)
{
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
fseek(fp, 0, SEEK_END);
int filesize = ftell(fp);
rewind(fp);
char *source_str = (char*)calloc(filesize, sizeof(char));
size_t bytes_read = fread(source_str, 1, filesize, fp);
source_str[bytes_read] = 0;
fclose(fp);
// Get platform information
cl_uint num_platforms;
RETURN_THROW(clGetPlatformIDs(0, NULL, &num_platforms));
cl_platform_id *platform_ids = (cl_platform_id *)calloc(num_platforms, sizeof(cl_platform_id));
RETURN_THROW(clGetPlatformIDs(num_platforms, platform_ids, NULL));
cl_device_id selected_device_id = NULL;
printf("available platforms:\n");
for (cl_uint i = 0; i < num_platforms; i++)
{
char platform_name[50];
RETURN_THROW(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 50, platform_name, NULL));
printf("%s\n", platform_name);
// get devices for this platform
cl_uint num_devices;
RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));
cl_device_id *device_ids = (cl_device_id *)calloc(num_devices, sizeof(cl_device_id));
RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, num_devices, device_ids, NULL));
// select first nvidia device
if (strstr(platform_name, "NVIDIA")) // ADAPT THIS ACCORDINGLY
{
selected_device_id = device_ids[0];
}
}
if (selected_device_id == NULL)
{
printf("No NVIDIA device found\n");
exit(1);
}
// Create an OpenCL context
cl_context context;
REF_THROW(context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret));
// Create a program from the kernel source
cl_program program;
REF_THROW(program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &ret));
// Build the program
cl_int ret = clBuildProgram(program, 1, &selected_device_id, NULL, NULL, NULL);
if (ret)
{
printf("BUILD ERROR\n");
// build error - get build log and display it
size_t build_log_size;
ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
char *build_log = new char[build_log_size];
ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
printf("%s\n", build_log);
exit(1);
}
printf("build finished successfully\n");
return 0;
}
答案 0 :(得分:1)
根据我的经验,NVIDIA 编译器在处理构建错误方面不够健壮,所以你的代码中很可能某处存在编译错误。
我认为你的问题确实出在 return 上,更确切地说,是它与 barrier 的组合。根据 OpenCL 规范中关于 barrier 的说明:
在处理器上执行该内核的工作组中的所有工作项都必须先执行此函数,之后才允许任何工作项继续执行 barrier 之后的代码。执行该内核的工作组中的所有工作项都必须遇到此函数。
如果 barrier 位于条件语句中,那么只要有任何工作项进入该条件语句并执行 barrier,所有工作项就都必须进入该条件语句。
如果 barrier 位于循环内,则对于循环的每次迭代,所有工作项都必须先执行 barrier,之后才允许任何工作项继续执行 barrier 之后的代码。
所以我认为你的问题可能在于:许多线程在到达 barrier 之前就返回了,这使得这段代码无效。也许你应该试试下面这样的写法:
/*
 * In-place maximum reduction over a 1024-element float buffer in local memory.
 *
 * On every iteration the active range is halved and each work-item in the left
 * half keeps the larger of its own value and the matching value from the right
 * half. No work-item returns early, so every work-item of the work-group reaches
 * the barrier on every iteration, as the OpenCL specification requires.
 *
 * local_value: local-memory buffer; must hold at least 1024 floats (the first
 *              iteration reads indices up to 511 + 512 = 1023).
 *
 * NOTE(review): the result ends up in local_value[0] only if the work-group has
 * at least 512 work-items -- confirm against the host-side launch configuration.
 */
kernel void testkernel(local float *local_value) {
    size_t thread_id = get_local_id(0);
    int remaining_items = 1024;
    while (remaining_items > 1) {
        remaining_items >>= 1; // throw away the right half of the threads
        // Strictly less-than: work-item `remaining_items` belongs to the right half.
        // Letting it run would read one element past the active range and would
        // write a slot that work-item 0 is reading in the same step.
        if (thread_id < (size_t)remaining_items) {
            // look for a greater value in the right half of the memory space
            size_t right_index = thread_id + (size_t)remaining_items;
            float right_value = local_value[right_index];
            if (right_value > local_value[thread_id]) {
                local_value[thread_id] = right_value;
            }
        }
        // The data lives in local memory, so the barrier must order local-memory accesses.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
编辑:此外,正如评论中指出的,必须写成 remaining_items >>= 1 而不是 remaining_items >> 1,以避免产生无限循环。