Question

这是某个并行归约/求极值内核的一部分。我已经把它精简为仍然会导致 clBuildProgram 崩溃的最小代码（注意：它是真的崩溃了，而不只是返回错误代码）：

编辑：当 local_value 声明为 global 而不是 local 时，似乎也会发生这种情况。

EDIT2 / SOLUTION ：问题是存在无限循环。我应该写remaining_items >>= 1而不是remaining_items >> 1。正如在答案中所说的，当涉及到编译/优化错误时，nvidia编译器看起来不是很强大。

// Reproduction case from the question: the code is intentionally kept exactly
// as posted, defects included, because this is the source that makes
// clBuildProgram crash.
//
// local_value: pointer to work-group local memory holding float values.
// NOTE(review): the loop starts at 1024 items, so the buffer is presumably
// expected to hold at least 1024 floats - confirm against the host code.
kernel void testkernel(local float *local_value)
{
    size_t thread_id = get_local_id(0);

    int remaining_items = 1024;

    while (remaining_items > 1)
    {
        // throw away the right half of the threads
        // BUG: the shift result is discarded (no assignment), so
        // remaining_items stays at 1024 and the loop condition never changes.
        // The intended statement is `remaining_items >>= 1;`.
        remaining_items >> 1; // <-- SPOTTED THE BUG
        if (thread_id > remaining_items)
        {
            // NOTE: a work-item that returns here never executes the barrier
            // at the bottom of the loop body.
            return;
        }

        // look for a greater value in the right half of the memory space
        int right_index = thread_id + remaining_items;
        float right_value = local_value[right_index];
        if (right_value > local_value[thread_id])
        {
            local_value[thread_id] = right_value;
        }

        // NOTE(review): the data is in local memory but the fence flag is
        // CLK_GLOBAL_MEM_FENCE - confirm whether CLK_LOCAL_MEM_FENCE was meant.
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}

删除行return;和/或local_value[thread_id] = right_value;会导致clBuildProgram成功完成。

我可以在我所有的计算机上重现这个问题（NVIDIA GTX 560、GT 555M、GT 540M，它们都是 Fermi 2.1 架构）。使用 NVIDIA CUDA Toolkit SDK 4.0、4.1 和 4.2 版本时，无论链接 x64 还是 x86 库，问题都会出现。

有没有人知道可能是什么问题？

编译器是否会自动假定本地（也称为共享）内存的大小为 (WORK_GROUP_SIZE) * sizeof(its_base_type)？这可以解释为什么只有删除上面提到的那些行时，构建才会成功。

用于复制的最小主机代码（C99兼容）：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

/*
 * RETURN_THROW(expression): evaluates an expression that yields a cl_int
 * status. On a non-zero status it prints the expression text followed by
 * " FAILED: <code>" to stdout and terminates the process with exit code 1.
 */
#define RETURN_THROW(expression)                          \
    do {                                                  \
        cl_int ret = expression;                          \
        if (ret != 0) {                                   \
            printf(#expression " FAILED: %d\n" , ret);    \
            exit(1);                                      \
        }                                                 \
    } while (0)

/*
 * REF_THROW(expression): for calls that report their status through an output
 * argument. The macro declares a local `cl_int ret`; the expression is
 * expected to store the status into it (e.g. by passing `&ret`). On a
 * non-zero status it prints the expression text followed by
 * " FAILED: <code>" to stdout and terminates the process with exit code 1.
 */
#define REF_THROW(expression)                             \
    do {                                                  \
        cl_int ret;                                       \
        expression;                                       \
        if (ret != 0) {                                   \
            printf(#expression " FAILED: %d\n" , ret);    \
            exit(1);                                      \
        }                                                 \
    } while (0)

int main(int argc, char **argv)
{
    // Load the kernel source code into the array source_str
    FILE *fp;

    fp = fopen("testkernel.cl", "rb");
    if (!fp)
    {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    fseek(fp, 0, SEEK_END);
    int filesize = ftell(fp);
    rewind(fp);
    char *source_str = (char*)calloc(filesize, sizeof(char));
    size_t bytes_read = fread(source_str, 1, filesize, fp);
    source_str[bytes_read] = 0;
    fclose(fp);

    // Get platform information
    cl_uint num_platforms;
    RETURN_THROW(clGetPlatformIDs(0, NULL, &num_platforms));

    cl_platform_id *platform_ids = (cl_platform_id *)calloc(num_platforms, sizeof(cl_platform_id));
    RETURN_THROW(clGetPlatformIDs(num_platforms, platform_ids, NULL));

    cl_device_id selected_device_id = NULL;

    printf("available platforms:\n");
    for (cl_uint i = 0; i < num_platforms; i++)
    {
        char platform_name[50];
        RETURN_THROW(clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 50, platform_name, NULL));
        printf("%s\n", platform_name);

        // get devices for this platform
        cl_uint num_devices;
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));

        cl_device_id *device_ids = (cl_device_id *)calloc(num_devices, sizeof(cl_device_id));
        RETURN_THROW(clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, num_devices, device_ids, NULL));

        // select first nvidia device
        if (strstr(platform_name, "NVIDIA"))        // ADAPT THIS ACCORDINGLY
        {
            selected_device_id = device_ids[0];
        }
    }

    if (selected_device_id == NULL)
    {
        printf("No NVIDIA device found\n");
        exit(1);
    }

    // Create an OpenCL context
    cl_context context;
    REF_THROW(context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret));

    // Create a program from the kernel source
    cl_program program;
    REF_THROW(program = clCreateProgramWithSource(context, 1, (const char **)&source_str, NULL, &ret));

    // Build the program
    cl_int ret = clBuildProgram(program, 1, &selected_device_id, NULL, NULL, NULL);
    if (ret)
    {
        printf("BUILD ERROR\n");
        // build error - get build log and display it
        size_t build_log_size;
        ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size);
        char *build_log = new char[build_log_size];
        ret = clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL);
        printf("%s\n", build_log);
        exit(1);
    }

    printf("build finished successfully\n");
    return 0;
}

Answer 1

根据我的经验，nvidia编译器在处理构建错误方面不是很强大，所以你可能在某处有编译错误。

我认为您的问题确实是return，或更多与barrier的组合。根据关于障碍的opencl规范：

在处理器上执行该内核的工作组中的所有工作项，都必须先执行此函数，之后任何工作项才被允许越过 barrier 继续执行。执行内核的工作组中的所有工作项都必须遇到此函数。

如果 barrier 位于条件语句内，那么只要有任何一个工作项进入该条件语句并执行 barrier，所有工作项都必须进入该条件语句。

如果 barrier 位于循环内，那么在循环的每次迭代中，所有工作项都必须先执行 barrier，之后任何工作项才被允许越过 barrier 继续执行。

所以我认为你的问题可能是很多线程在进入障碍之前会返回，这使得这些代码无效。也许你应该尝试这样的事情：

// Work-group reduction that leaves the maximum of the local buffer in
// local_value[0].
//
// local_value: pointer to work-group local memory. The loop starts from 1024
// items, so the buffer is assumed to hold at least 1024 floats (TODO confirm
// against the host-side clSetKernelArg size).
//
// No work-item returns early: every work-item of the group executes the
// barrier on every iteration, as the barrier built-in requires.
kernel void testkernel(local float *local_value) {
    size_t thread_id = get_local_id(0);
    int remaining_items = 1024;
    while (remaining_items > 1) {
        // halve the active range; compound assignment, otherwise the loop
        // condition never changes
        remaining_items >>= 1;
        // Strict '<': only work-items [0, remaining_items) are active. With
        // '<=' the work-item at index remaining_items would read
        // local_value[2 * remaining_items], one past the range being reduced
        // (out of bounds on the first pass).
        if (thread_id < (size_t)remaining_items) {
            // look for a greater value in the right half of the memory space
            size_t right_index = thread_id + (size_t)remaining_items;
            float right_value = local_value[right_index];
            if (right_value > local_value[thread_id])
                local_value[thread_id] = right_value;
        }
        // the data being exchanged is in local memory, so the local fence is
        // the one that orders these writes for the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

修改此外，正如评论中所述，它必须是remaining_items>>=1而不是remaining_items>>1，以避免产生无限循环。

构建此特定内核时，clBuildProgram会产生AccessViolationException

1 个答案: