OpenCL 矩阵相加

时间:2012-04-19 00:09:34

标签: c opencl

我正在尝试将两个各有100个元素的矩阵相加。我需要用任务并行的方式完成,而不是数据并行。我找到了下面的代码,它对同一组矩阵分别做加法、乘法、减法和除法,但运行时它只返回0,有时则返回2、-0、-2等......

我需要在 Mac 上使用 OpenCL。有什么办法可以做到这一点吗?

#include <stdio.h>
#include <stdlib.h>

#include <OpenCL/opencl.h>

#define MAX_SOURCE_SIZE (0x100000)

/*
 * OpenCL C source holding four kernels: taskParallelAdd, taskParallelSub,
 * taskParallelMul and taskParallelDiv.
 *
 * None of the kernels reads a work-item id. Each one combines A and B into C
 * at the four fixed indices base, base+4, base+8 and base+12, where base is
 * 0 for Add, 1 for Sub, 2 for Mul and 3 for Div, so the kernels write
 * disjoint elements of C.
 *
 * Adjacent string literals are concatenated by the compiler, so no line
 * continuation is needed between them.
 */
const char *_kernel =
    "\n"
    "__kernel void taskParallelAdd(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    "    int base = 0; \n"
    "     \n"
    "    C[base+0]  = A[base+0]  + B[base+0]; \n"
    "    C[base+4]  = A[base+4]  + B[base+4]; \n"
    "    C[base+8]  = A[base+8]  + B[base+8]; \n"
    "    C[base+12] = A[base+12] + B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelSub(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    "    int base = 1; \n"
    "     \n"
    "    C[base+0]  = A[base+0]  - B[base+0]; \n"
    "    C[base+4]  = A[base+4]  - B[base+4]; \n"
    "    C[base+8]  = A[base+8]  - B[base+8]; \n"
    "    C[base+12] = A[base+12] - B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelMul(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    "    int base = 2; \n"
    "     \n"
    "    C[base+0]  = A[base+0]  * B[base+0]; \n"
    "    C[base+4]  = A[base+4]  * B[base+4]; \n"
    "    C[base+8]  = A[base+8]  * B[base+8]; \n"
    "    C[base+12] = A[base+12] * B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelDiv(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    "    int base = 3; \n"
    "     \n"
    "    C[base+0]  = A[base+0]  / B[base+0]; \n"
    "    C[base+4]  = A[base+4]  / B[base+4]; \n"
    "    C[base+8]  = A[base+8]  / B[base+8]; \n"
    "    C[base+12] = A[base+12] / B[base+12]; \n"
    "} \n"
    " \n";

/*
 * Host program from the question. It fills two 4x4 float matrices, copies
 * them into OpenCL buffers, enqueues the four kernels defined in `_kernel`
 * as tasks on a single command queue, reads buffer C back and prints it as
 * a 4x4 grid.
 *
 * NOTE(review): every OpenCL call stores its status in `ret`, but `ret` is
 * never inspected, so any failure goes unnoticed. The answer further down
 * this page reports that clCreateCommandQueue fails with -35
 * (CL_INVALID_QUEUE_PROPERTIES) for the flag used here on the asker's
 * machine; if so, every later call that uses `command_queue` fails too and
 * the printed values are whatever the uninitialized `C` allocation holds.
 *
 * Always returns 0.
 */
int main()
{
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem Amobj = NULL;
    cl_mem Bmobj = NULL;
    cl_mem Cmobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel[4] = {NULL, NULL, NULL, NULL};
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;

    int i, j;
    float* A;
    float* B;
    float* C;

    /* Host-side storage for the 4x4 matrices; the malloc results are not checked. */
    A = (float*)malloc(4*4*sizeof(float));
    B = (float*)malloc(4*4*sizeof(float));
    C = (float*)malloc(4*4*sizeof(float));

    /* Initialize input data: A holds 1..16 in row-major order, B is the transpose of A */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            A[i*4+j] = i*4+j+1;
            B[i*4+j] = j*4+i+1;
        }
    }

    /* Get platform/device information (first platform, one default device) */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* Create OpenCL Context */
    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    /* Create command queue, requesting out-of-order execution.
     * NOTE(review): this is the call the answer identifies as failing; the
     * returned handle and `ret` are not checked before the queue is used. */
    command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);

    /* Create buffer object: one device buffer of 16 floats per matrix */
    Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
    Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);

    /* Copy input data to memory buffer (CL_TRUE requests a blocking write) */
    ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, 4*4*sizeof(float), B, 0, NULL, NULL);

    /* Create kernel from source: build the program held in the `_kernel` string */
    program = clCreateProgramWithSource(context, 1, (const char **)&_kernel, NULL, &ret);
    ret     = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    /* Create task parallel OpenCL kernel: one kernel object per operation */
    kernel[0] = clCreateKernel(program, "taskParallelAdd", &ret);
    kernel[1] = clCreateKernel(program, "taskParallelSub", &ret);
    kernel[2] = clCreateKernel(program, "taskParallelMul", &ret);
    kernel[3] = clCreateKernel(program, "taskParallelDiv", &ret);

    /* Set OpenCL kernel arguments: all four kernels share the same A, B and C buffers */
    for (i=0; i<4; i++) {
        ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&Amobj);
        ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&Bmobj);
        ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&Cmobj);
    }

    /* Execute OpenCL kernel as task parallel: all four tasks go to the same queue */
    for (i=0; i<4; i++) {
        ret = clEnqueueTask(command_queue, kernel[i], 0, NULL, NULL);
    }

    /* Copy result to host (blocking read).
     * NOTE(review): no event wait list or barrier orders this read after the
     * tasks above; on an out-of-order queue that ordering is presumably not
     * guaranteed - confirm against the OpenCL specification. */
    ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);

    /* Display result as four rows of four values */
    for (i=0; i<4; i++) {
        for (j=0; j<4; j++) {
            printf("%7.2f ", C[i*4+j]);
        }
        printf("\n");
    }

    /* Finalization: drain the queue, then release every OpenCL object */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel[0]);
    ret = clReleaseKernel(kernel[1]);
    ret = clReleaseKernel(kernel[2]);
    ret = clReleaseKernel(kernel[3]);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(Amobj);
    ret = clReleaseMemObject(Bmobj);
    ret = clReleaseMemObject(Cmobj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    free(A);
    free(B);
    free(C);

    return 0;
}

1 个答案:

答案 0 :(得分:1)

由于clCreateCommandQueue返回了-35(CL_INVALID_QUEUE_PROPERTIES),command_queue根本没有被创建,所以基本上什么都没有执行(连一个内核都没有运行)。您打印出来的只是C矩阵所在内存中的随机值(因为它从未被初始化)。您确实应该检查所有API调用的返回值,这样这类错误就会立刻暴露出来。

这个错误与您使用的CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE标志有关。该标志在您的平台上显然不受支持,而且它本来也不能实现您想要的效果。此标志告诉OpenCL运行时:对于给定的队列,内核不必按照入队的顺序执行。但是,其执行方式本质上仍然是带有数据并行性的顺序内核执行。这与并发运行多个内核不同,而后者才是任务并行执行所需要的。

您需要做的是创建四个命令队列,每个内核一个。然后,您可以等待所有队列的事件完成。如果您共享相同的输出矩阵,则需要小心,以确保您不会意外引入竞争条件。

OpenCL 1.2规范的第3.4.2节描述了任务并行编程模型。像这样运行多个队列时,您可能希望使用事件来跟踪每个队列的执行和完成状态。有关详细信息,请参阅该规范的第5.9节。

下面是您的测试代码,已更新为使用多个队列来并行运行各个任务。我做了一个快速验证,结果是正确的。

#include <stdio.h>
#include <stdlib.h>

#include <OpenCL/opencl.h>

#define MAX_SOURCE_SIZE (0x100000)

/*
 * Program source handed to clCreateProgramWithSource: four OpenCL C kernels
 * (taskParallelAdd / Sub / Mul / Div) that each fill four elements of C from
 * the matching elements of A and B.
 *
 * The kernels never query a work-item id. Kernel k (Add = 0, Sub = 1,
 * Mul = 2, Div = 3) touches only indices k, k+4, k+8 and k+12, so no two
 * kernels write the same element of C.
 *
 * The literal pieces below are joined by ordinary string-literal
 * concatenation.
 */
const char *_kernel =
    "\n"
    "__kernel void taskParallelAdd(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    "    int base = 0; \n"
    "     \n"
    "    C[base+0]  = A[base+0]  + B[base+0]; \n"
    "    C[base+4]  = A[base+4]  + B[base+4]; \n"
    "    C[base+8]  = A[base+8]  + B[base+8]; \n"
    "    C[base+12] = A[base+12] + B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelSub(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    "    int base = 1; \n"
    "     \n"
    "    C[base+0]  = A[base+0]  - B[base+0]; \n"
    "    C[base+4]  = A[base+4]  - B[base+4]; \n"
    "    C[base+8]  = A[base+8]  - B[base+8]; \n"
    "    C[base+12] = A[base+12] - B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelMul(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    "    int base = 2; \n"
    "     \n"
    "    C[base+0]  = A[base+0]  * B[base+0]; \n"
    "    C[base+4]  = A[base+4]  * B[base+4]; \n"
    "    C[base+8]  = A[base+8]  * B[base+8]; \n"
    "    C[base+12] = A[base+12] * B[base+12]; \n"
    "} \n"
    " \n"
    "__kernel void taskParallelDiv(__global float* A, __global float* B, __global float* C) \n"
    "{ \n"
    "    int base = 3; \n"
    "     \n"
    "    C[base+0]  = A[base+0]  / B[base+0]; \n"
    "    C[base+4]  = A[base+4]  / B[base+4]; \n"
    "    C[base+8]  = A[base+8]  / B[base+8]; \n"
    "    C[base+12] = A[base+12] / B[base+12]; \n"
    "} \n"
    " \n";

/*
 * Report a failed OpenCL call.
 *
 * err  - status code produced by the call
 * what - name of the call, used in the diagnostic
 *
 * Returns 0 when err is CL_SUCCESS; otherwise writes a message to stderr
 * and returns 1 so the caller can branch straight to its cleanup path.
 */
static int cl_failed(cl_int err, const char *what)
{
    if (err == CL_SUCCESS) {
        return 0;
    }
    fprintf(stderr, "%s failed with OpenCL error %d\n", what, (int)err);
    return 1;
}

/*
 * Task-parallel demo: runs the four kernels from `_kernel` (add, sub, mul,
 * div), each on its own in-order command queue, against two 4x4 float
 * matrices, then prints the combined 4x4 result.
 *
 * Every OpenCL status code and every allocation is checked. On the first
 * failure a diagnostic is printed and control jumps to a single cleanup
 * path that releases whatever has been created so far.
 *
 * Returns EXIT_SUCCESS (0) when the result was computed and printed,
 * EXIT_FAILURE otherwise.
 */
int main()
{
    enum { DIM = 4, NUM_TASKS = 4 };
    static const char *const kernel_names[NUM_TASKS] = {
        "taskParallelAdd", "taskParallelSub", "taskParallelMul", "taskParallelDiv"
    };
    const size_t bytes = DIM * DIM * sizeof(float);

    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue[NUM_TASKS] = {NULL, NULL, NULL, NULL};
    cl_mem Amobj = NULL;
    cl_mem Bmobj = NULL;
    cl_mem Cmobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel[NUM_TASKS] = {NULL, NULL, NULL, NULL};
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;

    int status = EXIT_FAILURE;
    int i, j;
    float *A = malloc(bytes);
    float *B = malloc(bytes);
    /* Zero-filled so the printed result is never indeterminate memory. */
    float *C = calloc(DIM * DIM, sizeof(float));

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    /* Initialize input data: A is 1..16 row-major, B is the transpose of A */
    for (i = 0; i < DIM; i++) {
        for (j = 0; j < DIM; j++) {
            A[i*DIM+j] = i*DIM+j+1;
            printf("A[%d] = %d\n", i*DIM+j, i*DIM+j+1);
            B[i*DIM+j] = j*DIM+i+1;
            printf("B[%d] = %d\n", i*DIM+j, j*DIM+i+1);
        }
    }

    /* Get platform/device information */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    if (cl_failed(ret, "clGetPlatformIDs")) {
        goto cleanup;
    }
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
    if (cl_failed(ret, "clGetDeviceIDs")) {
        goto cleanup;
    }

    /* Create OpenCL Context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (cl_failed(ret, "clCreateContext")) {
        goto cleanup;
    }

    /* Create buffer objects */
    Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(A)")) {
        goto cleanup;
    }
    Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(B)")) {
        goto cleanup;
    }
    Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(C)")) {
        goto cleanup;
    }

    /* One in-order queue per task so the kernels can be scheduled independently */
    for (i = 0; i < NUM_TASKS; i++) {
        command_queue[i] = clCreateCommandQueue(context, device_id, 0, &ret);
        if (cl_failed(ret, "clCreateCommandQueue")) {
            goto cleanup;
        }
    }

    /*
     * Copy input data to the device once. The buffers are shared by all
     * kernels, and the blocking write has completed before any kernel is
     * enqueued, so repeating the upload on every queue is unnecessary.
     */
    ret = clEnqueueWriteBuffer(command_queue[0], Amobj, CL_TRUE, 0, bytes, A, 0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueWriteBuffer(A)")) {
        goto cleanup;
    }
    ret = clEnqueueWriteBuffer(command_queue[0], Bmobj, CL_TRUE, 0, bytes, B, 0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueWriteBuffer(B)")) {
        goto cleanup;
    }

    /* Create kernel from source */
    program = clCreateProgramWithSource(context, 1, &_kernel, NULL, &ret);
    if (cl_failed(ret, "clCreateProgramWithSource")) {
        goto cleanup;
    }
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (cl_failed(ret, "clBuildProgram")) {
        goto cleanup;
    }

    /* Create the task parallel kernels and bind the shared A, B, C buffers */
    for (i = 0; i < NUM_TASKS; i++) {
        kernel[i] = clCreateKernel(program, kernel_names[i], &ret);
        if (cl_failed(ret, kernel_names[i])) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&Amobj);
        if (cl_failed(ret, "clSetKernelArg(A)")) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&Bmobj);
        if (cl_failed(ret, "clSetKernelArg(B)")) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&Cmobj);
        if (cl_failed(ret, "clSetKernelArg(C)")) {
            goto cleanup;
        }
    }

    /* Execute OpenCL kernels as task parallel, one per queue */
    for (i = 0; i < NUM_TASKS; i++) {
        ret = clEnqueueTask(command_queue[i], kernel[i], 0, NULL, NULL);
        if (cl_failed(ret, "clEnqueueTask")) {
            goto cleanup;
        }
        /* Submit right away so all tasks are in flight before we block below */
        ret = clFlush(command_queue[i]);
        if (cl_failed(ret, "clFlush")) {
            goto cleanup;
        }
    }

    /* Wait for each queue to finish */
    for (i = 0; i < NUM_TASKS; i++) {
        printf("Waiting for %d to finish...\n", i);
        ret = clFinish(command_queue[i]);
        if (cl_failed(ret, "clFinish")) {
            goto cleanup;
        }
    }

    /* All kernels are done; copy the result to the host (blocking read) */
    ret = clEnqueueReadBuffer(command_queue[0], Cmobj, CL_TRUE, 0, bytes, C, 0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueReadBuffer")) {
        goto cleanup;
    }

    /* Display result */
    for (i = 0; i < DIM; i++) {
        for (j = 0; j < DIM; j++) {
            printf("%7.2f ", C[i*DIM+j]);
        }
        printf("\n");
    }

    status = EXIT_SUCCESS;

cleanup:
    /* Finalization: release only what was actually created */
    for (i = 0; i < NUM_TASKS; i++) {
        if (kernel[i] != NULL) {
            clReleaseKernel(kernel[i]);
        }
    }
    if (program != NULL) {
        clReleaseProgram(program);
    }
    if (Amobj != NULL) {
        clReleaseMemObject(Amobj);
    }
    if (Bmobj != NULL) {
        clReleaseMemObject(Bmobj);
    }
    if (Cmobj != NULL) {
        clReleaseMemObject(Cmobj);
    }
    for (i = 0; i < NUM_TASKS; i++) {
        if (command_queue[i] != NULL) {
            clReleaseCommandQueue(command_queue[i]);
        }
    }
    if (context != NULL) {
        clReleaseContext(context);
    }

    free(A);
    free(B);
    free(C);

    return status;
}