openCL上的约简算法实现

时间:2015-12-04 01:20:00

标签: opencl

我有以下"添加2 ^ 20个花车"使用两个不同的内核在OpenCL上实现:一个应该在标量基础上处理值,另一个将使用float4向量。虽然代码构建时没有任何编译错误,但程序无法运行。我已经在NVIDIA和AMD平台上尝试过这些代码,在这两种情况下它都失败了。当我调试代码时,它给出了堆栈溢出的错误。此代码是Action Book中OpenCL的示例代码。有什么想法为什么代码不运行? 这是代码:

#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "reduction.cl"

#define ARRAY_SIZE 1048576
#define NUM_KERNELS 2

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {

    cl_platform_id platform;
    cl_device_id dev;
    int err;

    /* Identify a platform */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err < 0) {
        perror("Couldn't identify a platform");
        exit(1);
    }

    /* Access a device */
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
    if (err == CL_DEVICE_NOT_FOUND) {
        printf(" GPU is not first! Going on CPU :(");
        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
    }
    if (err < 0) {
        perror("Couldn't access any devices");
        exit(1);
    }

    return dev;
}

/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

    cl_program program;
    FILE *program_handle;
    char *program_buffer, *program_log;
    size_t program_size, log_size;
    int err;

    /* Read program file and place content into buffer */
    program_handle = fopen(filename, "r");
    if (program_handle == NULL) {
        perror("Couldn't find the program file");
        exit(1);
    }
    fseek(program_handle, 0, SEEK_END);
    program_size = ftell(program_handle);
    rewind(program_handle);
    program_buffer = (char*)malloc(program_size + 1);
    program_buffer[program_size] = '\0';
    fread(program_buffer, sizeof(char), program_size, program_handle);
    fclose(program_handle);

    /* Create program from file */
    program = clCreateProgramWithSource(ctx, 1,
        (const char**)&program_buffer, &program_size, &err);
    if (err < 0) {
        perror("Couldn't create the program");
        exit(1);
    }
    free(program_buffer);

    /* Build program */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err < 0) {

        /* Find size of log and print to std output */
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            0, NULL, &log_size);
        program_log = (char*)malloc(log_size + 1);
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
            log_size + 1, program_log, NULL);
        printf("%s\n", program_log);
        free(program_log);
        exit(1);
    }

    return program;
}

int main() {

    /* OpenCL structures */
    cl_device_id device;
    cl_context context;
    cl_program program;
    cl_kernel kernel[NUM_KERNELS];
    cl_command_queue queue;
    cl_event prof_event;
    cl_int i, j, err;
    size_t local_size, global_size;
    char kernel_names[NUM_KERNELS][20] =
    { "reduction_scalar", "reduction_vector" };

    /* Data and buffers */
    float data[ARRAY_SIZE];
    float sum, actual_sum, *scalar_sum, *vector_sum;
    cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer;
    cl_int num_groups;
    cl_ulong time_start, time_end, total_time;

    /* Initialize data */
    for (i = 0; i<ARRAY_SIZE; i++) {
        data[i] = 1.0f*i;
    }

    /* Create device and determine local size */
    device = create_device();
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(local_size), &local_size, NULL);
    if (err < 0) {
        perror("Couldn't obtain device information");
        exit(1);
    }

    /* Allocate and initialize output arrays */
    num_groups = ARRAY_SIZE / local_size;
    scalar_sum = (float*)malloc(num_groups * sizeof(float));
    vector_sum = (float*)malloc(num_groups / 4 * sizeof(float));
    for (i = 0; i<num_groups; i++) {
        scalar_sum[i] = 0.0f;
    }
    for (i = 0; i<num_groups / 4; i++) {
        vector_sum[i] = 0.0f;
    }

    /* Create a context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err < 0) {
        perror("Couldn't create a context");
        exit(1);
    }

    /* Build program */
    program = build_program(context, device, PROGRAM_FILE);

    /* Create data buffer */
    data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
        CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
    scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
        CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err);
    vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
        CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err);
    if (err < 0) {
        perror("Couldn't create a buffer");
        exit(1);
    };

    /* Create a command queue */
    queue = clCreateCommandQueue(context, device,
        CL_QUEUE_PROFILING_ENABLE, &err);
    if (err < 0) {
        perror("Couldn't create a command queue");
        exit(1);
    };

    for (i = 0; i<NUM_KERNELS; i++) {

        /* Create a kernel */
        kernel[i] = clCreateKernel(program, kernel_names[i], &err);
        if (err < 0) {
            perror("Couldn't create a kernel");
            exit(1);
        };

        /* Create kernel arguments */
        err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer);
        if (i == 0) {
            global_size = ARRAY_SIZE;
            err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL);
            err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer);
        }
        else {
            global_size = ARRAY_SIZE / 4;
            err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL);
            err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer);
        }
        if (err < 0) {
            perror("Couldn't create a kernel argument");
            exit(1);
        }

        /* Enqueue kernel */
        err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size,
            &local_size, 0, NULL, &prof_event);
        if (err < 0) {
            perror("Couldn't enqueue the kernel");
            exit(1);
        }

        /* Finish processing the queue and get profiling information */
        clFinish(queue);
        clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
            sizeof(time_start), &time_start, NULL);
        clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
            sizeof(time_end), &time_end, NULL);
        total_time = time_end - time_start;

        /* Read the result */
        if (i == 0) {
            err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0,
                num_groups * sizeof(float), scalar_sum, 0, NULL, NULL);
            if (err < 0) {
                perror("Couldn't read the buffer");
                exit(1);
            }
            sum = 0.0f;
            for (j = 0; j<num_groups; j++) {
                sum += scalar_sum[j];
            }
        }
        else {
            err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0,
                num_groups / 4 * sizeof(float), vector_sum, 0, NULL, NULL);
            if (err < 0) {
                perror("Couldn't read the buffer");
                exit(1);
            }
            sum = 0.0f;
            for (j = 0; j<num_groups / 4; j++) {
                sum += vector_sum[j];
            }
        }

        /* Check result */
        printf("%s: ", kernel_names[i]);
        actual_sum = 1.0f * ARRAY_SIZE / 2 * (ARRAY_SIZE - 1);
        if (fabs(sum - actual_sum) > 0.01*fabs(sum))
            printf("Check failed.\n");
        else
            printf("Check passed.\n");
        printf("Total time = %lu\n\n", total_time);

        /* Deallocate event */
        clReleaseEvent(prof_event);
    }

    /* Deallocate resources */
    free(scalar_sum);
    free(vector_sum);
    for (i = 0; i<NUM_KERNELS; i++) {
        clReleaseKernel(kernel[i]);
    }
    clReleaseMemObject(scalar_sum_buffer);
    clReleaseMemObject(vector_sum_buffer);
    clReleaseMemObject(data_buffer);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);
    return 0;
}

这里是存储在&#34; reduction.cl&#34;中的内核。文件:

__kernel void reduction_scalar(__global float* data, 
      __local float* partial_sums, __global float* output) {

   int lid = get_local_id(0);
   int group_size = get_local_size(0);

   partial_sums[lid] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   for(int i = group_size/2; i>0; i >>= 1) {
      if(lid < i) {
         partial_sums[lid] += partial_sums[lid + i];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
   }

   if(lid == 0) {
      output[get_group_id(0)] = partial_sums[0];
   }
}

__kernel void reduction_vector(__global float4* data, 
      __local float4* partial_sums, __global float* output) {

   int lid = get_local_id(0);
   int group_size = get_local_size(0);

   partial_sums[lid] = data[get_global_id(0)];
   barrier(CLK_LOCAL_MEM_FENCE);

   for(int i = group_size/2; i>0; i >>= 1) {
      if(lid < i) {
         partial_sums[lid] += partial_sums[lid + i];
      }
      barrier(CLK_LOCAL_MEM_FENCE);
   }

   if(lid == 0) {
      output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
   }
}

1 个答案:

答案 0 :(得分:1)

data的大小太大了。您需要使用malloc动态分配它。改变

float data[ARRAY_SIZE];

float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE);

解决了这个问题。