我正在尝试将两个各有 100 个元素的矩阵相加,并且需要以任务并行(而不是数据并行)的方式完成。我得到了下面这段代码,它在同一个矩阵上分别执行加法、乘法、减法和除法,但运行时它只返回 0,有时则是 2、-0、-2 等值……
我需要在 Mac 上使用 OpenCL。有什么办法可以做到这一点吗?
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#define MAX_SOURCE_SIZE (0x100000)
/*
 * OpenCL C source containing four kernels: taskParallelAdd, taskParallelSub,
 * taskParallelMul and taskParallelDiv.
 *
 * Each kernel takes three __global float buffers (A, B, C) and writes
 * C[k] = A[k] <op> B[k] for k = base, base+4, base+8, base+12, where base is
 * hard-coded to 0 (add), 1 (sub), 2 (mul) and 3 (div). With the row-major
 * 4x4 layout used by the host code (index i*4+j) that is one column per
 * kernel, so the four kernels write disjoint elements of C.
 *
 * The kernels never query a work-item id, so every work-item would perform
 * the same four operations.
 */
const char *_kernel = "\n" \
"__kernel void taskParallelAdd(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
" int base = 0; \n" \
" \n" \
" C[base+0] = A[base+0] + B[base+0]; \n" \
" C[base+4] = A[base+4] + B[base+4]; \n" \
" C[base+8] = A[base+8] + B[base+8]; \n" \
" C[base+12] = A[base+12] + B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelSub(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
" int base = 1; \n" \
" \n" \
" C[base+0] = A[base+0] - B[base+0]; \n" \
" C[base+4] = A[base+4] - B[base+4]; \n" \
" C[base+8] = A[base+8] - B[base+8]; \n" \
" C[base+12] = A[base+12] - B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelMul(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
" int base = 2; \n" \
" \n" \
" C[base+0] = A[base+0] * B[base+0]; \n" \
" C[base+4] = A[base+4] * B[base+4]; \n" \
" C[base+8] = A[base+8] * B[base+8]; \n" \
" C[base+12] = A[base+12] * B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelDiv(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
" int base = 3; \n" \
" \n" \
" C[base+0] = A[base+0] / B[base+0]; \n" \
" C[base+4] = A[base+4] / B[base+4]; \n" \
" C[base+8] = A[base+8] / B[base+8]; \n" \
" C[base+12] = A[base+12] / B[base+12]; \n" \
"} \n" \
" \n";
int main()
{
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem Amobj = NULL;
cl_mem Bmobj = NULL;
cl_mem Cmobj = NULL;
cl_program program = NULL;
cl_kernel kernel[4] = {NULL, NULL, NULL, NULL};
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
int i, j;
float* A;
float* B;
float* C;
A = (float*)malloc(4*4*sizeof(float));
B = (float*)malloc(4*4*sizeof(float));
C = (float*)malloc(4*4*sizeof(float));
/* Initialize input data */
for (i=0; i<4; i++) {
for (j=0; j<4; j++) {
A[i*4+j] = i*4+j+1;
B[i*4+j] = j*4+i+1;
}
}
/* Get platform/device information */
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
/* Create OpenCL Context */
context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
/* Create command queue */
command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);
/* Create buffer object */
Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret);
/* Copy input data to memory buffer */
ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, 4*4*sizeof(float), B, 0, NULL, NULL);
/* Create kernel from source */
program = clCreateProgramWithSource(context, 1, (const char **)&_kernel, NULL, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create task parallel OpenCL kernel */
kernel[0] = clCreateKernel(program, "taskParallelAdd", &ret);
kernel[1] = clCreateKernel(program, "taskParallelSub", &ret);
kernel[2] = clCreateKernel(program, "taskParallelMul", &ret);
kernel[3] = clCreateKernel(program, "taskParallelDiv", &ret);
/* Set OpenCL kernel arguments */
for (i=0; i<4; i++) {
ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&Amobj);
ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&Bmobj);
ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&Cmobj);
}
/* Execute OpenCL kernel as task parallel */
for (i=0; i<4; i++) {
ret = clEnqueueTask(command_queue, kernel[i], 0, NULL, NULL);
}
/* Copy result to host */
ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL);
/* Display result */
for (i=0; i<4; i++) {
for (j=0; j<4; j++) {
printf("%7.2f ", C[i*4+j]);
}
printf("\n");
}
/* Finalization */
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel[0]);
ret = clReleaseKernel(kernel[1]);
ret = clReleaseKernel(kernel[2]);
ret = clReleaseKernel(kernel[3]);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(Amobj);
ret = clReleaseMemObject(Bmobj);
ret = clReleaseMemObject(Cmobj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
答案 0 :(得分:1)
由于 clCreateCommandQueue 返回了 -35(CL_INVALID_QUEUE_PROPERTIES),command_queue 并没有被创建,所以基本上什么都没有执行(连一个内核都没有运行)。您打印出来的只是 C 矩阵所在内存中的任意值(因为它从未被初始化)。您确实应该检查所有 API 调用的返回值,这样此类错误就会立刻暴露出来。
该错误与您使用的 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE 标志有关。这个标志在您的设备上显然不受支持,而且它本来也无法实现您想要的效果。此标志告诉 OpenCL 运行时:对于给定的队列,内核不必按照入队的顺序执行。但其运行方式本质上仍然是带有数据并行性的顺序内核执行。这与并发运行多个内核不同,而后者才是任务并行执行所需要的。
您需要做的是创建四个命令队列,每个内核一个。然后,您可以等待所有队列上的事件完成。如果各个内核共享同一个输出矩阵,则需要小心,确保不会意外引入竞争条件。
任务并行模型在OpenCL 1.2参考手册的3.4.2节中描述。运行这样的多个队列时,您可能希望使用事件来跟踪每个队列的执行和完成状态。有关详细信息,请参阅参考文献的第5.9节。
下面是您的测试代码,已更新为使用多个队列来并行运行各个任务。我做了一次快速验证,结果是正确的。
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#define MAX_SOURCE_SIZE (0x100000)
/*
 * OpenCL C source containing four kernels: taskParallelAdd, taskParallelSub,
 * taskParallelMul and taskParallelDiv.
 *
 * Each kernel takes three __global float buffers (A, B, C) and writes
 * C[k] = A[k] <op> B[k] for k = base, base+4, base+8, base+12, where base is
 * hard-coded to 0 (add), 1 (sub), 2 (mul) and 3 (div). With the row-major
 * 4x4 layout used by the host code (index i*4+j) that is one column per
 * kernel, so the four kernels write disjoint elements of C and can run
 * concurrently without racing on the shared output buffer.
 *
 * The kernels never query a work-item id, so every work-item would perform
 * the same four operations.
 */
const char *_kernel = "\n" \
"__kernel void taskParallelAdd(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
" int base = 0; \n" \
" \n" \
" C[base+0] = A[base+0] + B[base+0]; \n" \
" C[base+4] = A[base+4] + B[base+4]; \n" \
" C[base+8] = A[base+8] + B[base+8]; \n" \
" C[base+12] = A[base+12] + B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelSub(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
" int base = 1; \n" \
" \n" \
" C[base+0] = A[base+0] - B[base+0]; \n" \
" C[base+4] = A[base+4] - B[base+4]; \n" \
" C[base+8] = A[base+8] - B[base+8]; \n" \
" C[base+12] = A[base+12] - B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelMul(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
" int base = 2; \n" \
" \n" \
" C[base+0] = A[base+0] * B[base+0]; \n" \
" C[base+4] = A[base+4] * B[base+4]; \n" \
" C[base+8] = A[base+8] * B[base+8]; \n" \
" C[base+12] = A[base+12] * B[base+12]; \n" \
"} \n" \
" \n" \
"__kernel void taskParallelDiv(__global float* A, __global float* B, __global float* C) \n" \
"{ \n" \
" int base = 3; \n" \
" \n" \
" C[base+0] = A[base+0] / B[base+0]; \n" \
" C[base+4] = A[base+4] / B[base+4]; \n" \
" C[base+8] = A[base+8] / B[base+8]; \n" \
" C[base+12] = A[base+12] / B[base+12]; \n" \
"} \n" \
" \n";
/* Prints a diagnostic for a failed OpenCL call; returns non-zero iff status is an error. */
static int cl_failed(cl_int status, const char *what)
{
    if (status == CL_SUCCESS) {
        return 0;
    }
    fprintf(stderr, "%s failed: OpenCL error %d\n", what, (int)status);
    return 1;
}

/* Prints the build log of `program` for `device` to stderr, when it can be fetched. */
static void print_build_log(cl_program program, cl_device_id device)
{
    size_t len = 0;
    char *log;

    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              0, NULL, &len) != CL_SUCCESS || len == 0) {
        return;
    }
    log = malloc(len);
    if (log == NULL) {
        return;
    }
    if (clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              len, log, NULL) == CL_SUCCESS) {
        fprintf(stderr, "%.*s\n", (int)len, log);
    }
    free(log);
}

/*
 * Runs the four kernels (add, sub, mul, div) as parallel tasks, each on its
 * own in-order command queue, then prints the resulting 4x4 matrix C.
 *
 * The inputs are uploaded once with blocking writes, because the buffers
 * belong to the context and are shared by all four queues. Every queue is
 * drained with clFinish before C is read back.
 *
 * Every OpenCL call is checked. On the first failure a diagnostic is written
 * to stderr, everything acquired so far is released, and EXIT_FAILURE is
 * returned. Returns 0 on success.
 */
int main(void)
{
    enum { DIM = 4, ELEMS = DIM * DIM, NUM_TASKS = 4 };
    static const char *const kernel_names[NUM_TASKS] = {
        "taskParallelAdd", "taskParallelSub", "taskParallelMul", "taskParallelDiv"
    };
    const size_t bytes = ELEMS * sizeof(float);
    int exit_code = EXIT_FAILURE;
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue[NUM_TASKS] = {NULL, NULL, NULL, NULL};
    cl_mem Amobj = NULL;
    cl_mem Bmobj = NULL;
    cl_mem Cmobj = NULL;
    cl_program program = NULL;
    cl_kernel kernel[NUM_TASKS] = {NULL, NULL, NULL, NULL};
    cl_uint ret_num_devices = 0;
    cl_uint ret_num_platforms = 0;
    cl_int ret;
    int i, j;
    float *A = malloc(bytes);
    float *B = malloc(bytes);
    /* Zeroed so that a partial failure can never print indeterminate values. */
    float *C = calloc(ELEMS, sizeof *C);

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    /* Initialize input data: A counts row-major, B is its transpose. */
    for (i = 0; i < DIM; i++) {
        for (j = 0; j < DIM; j++) {
            A[i * DIM + j] = i * DIM + j + 1;
            /* %d, not %u: the arguments are plain int. */
            printf("A[%d] = %d\n", i * DIM + j, i * DIM + j + 1);
            B[i * DIM + j] = j * DIM + i + 1;
            printf("B[%d] = %d\n", i * DIM + j, j * DIM + i + 1);
        }
    }

    /* Get platform/device information */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    if (cl_failed(ret, "clGetPlatformIDs")) {
        goto cleanup;
    }
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
    if (cl_failed(ret, "clGetDeviceIDs")) {
        goto cleanup;
    }

    /* Create OpenCL context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (cl_failed(ret, "clCreateContext")) {
        goto cleanup;
    }

    /* Create buffer objects */
    Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(A)")) {
        goto cleanup;
    }
    Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(B)")) {
        goto cleanup;
    }
    Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &ret);
    if (cl_failed(ret, "clCreateBuffer(C)")) {
        goto cleanup;
    }

    /* Set up one in-order queue per task */
    for (i = 0; i < NUM_TASKS; i++) {
        command_queue[i] = clCreateCommandQueue(context, device_id, 0, &ret);
        if (cl_failed(ret, "clCreateCommandQueue")) {
            goto cleanup;
        }
    }

    /*
     * Copy input data to the device once. The writes are blocking, so the
     * data is in place before any kernel is enqueued on any queue.
     */
    ret = clEnqueueWriteBuffer(command_queue[0], Amobj, CL_TRUE, 0, bytes, A, 0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueWriteBuffer(A)")) {
        goto cleanup;
    }
    ret = clEnqueueWriteBuffer(command_queue[0], Bmobj, CL_TRUE, 0, bytes, B, 0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueWriteBuffer(B)")) {
        goto cleanup;
    }

    /* Create and build the program from source */
    program = clCreateProgramWithSource(context, 1, &_kernel, NULL, &ret);
    if (cl_failed(ret, "clCreateProgramWithSource")) {
        goto cleanup;
    }
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (cl_failed(ret, "clBuildProgram")) {
        print_build_log(program, device_id);
        goto cleanup;
    }

    /* Create the task-parallel kernels and bind the shared buffers */
    for (i = 0; i < NUM_TASKS; i++) {
        kernel[i] = clCreateKernel(program, kernel_names[i], &ret);
        if (cl_failed(ret, kernel_names[i])) {
            goto cleanup;
        }
        ret = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &Amobj);
        if (ret == CL_SUCCESS) {
            ret = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), &Bmobj);
        }
        if (ret == CL_SUCCESS) {
            ret = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &Cmobj);
        }
        if (cl_failed(ret, "clSetKernelArg")) {
            goto cleanup;
        }
    }

    /* Execute each kernel as a task on its own queue */
    for (i = 0; i < NUM_TASKS; i++) {
        ret = clEnqueueTask(command_queue[i], kernel[i], 0, NULL, NULL);
        if (cl_failed(ret, "clEnqueueTask")) {
            goto cleanup;
        }
    }

    /* Wait for each queue to finish before touching C */
    for (i = 0; i < NUM_TASKS; i++) {
        printf("Waiting for %d to finish...\n", i);
        ret = clFinish(command_queue[i]);
        if (cl_failed(ret, "clFinish")) {
            goto cleanup;
        }
    }

    /* Copy result to host */
    ret = clEnqueueReadBuffer(command_queue[0], Cmobj, CL_TRUE, 0, bytes, C, 0, NULL, NULL);
    if (cl_failed(ret, "clEnqueueReadBuffer")) {
        goto cleanup;
    }

    /* Display result */
    for (i = 0; i < DIM; i++) {
        for (j = 0; j < DIM; j++) {
            printf("%7.2f ", C[i * DIM + j]);
        }
        printf("\n");
    }
    exit_code = 0;

cleanup:
    /* Finalization: drain every queue first so nothing is released while still in use. */
    for (i = 0; i < NUM_TASKS; i++) {
        if (command_queue[i] != NULL) {
            clFinish(command_queue[i]);
        }
    }
    for (i = 0; i < NUM_TASKS; i++) {
        if (kernel[i] != NULL) {
            clReleaseKernel(kernel[i]);
        }
    }
    if (program != NULL) {
        clReleaseProgram(program);
    }
    if (Amobj != NULL) {
        clReleaseMemObject(Amobj);
    }
    if (Bmobj != NULL) {
        clReleaseMemObject(Bmobj);
    }
    if (Cmobj != NULL) {
        clReleaseMemObject(Cmobj);
    }
    for (i = 0; i < NUM_TASKS; i++) {
        if (command_queue[i] != NULL) {
            clReleaseCommandQueue(command_queue[i]);
        }
    }
    if (context != NULL) {
        clReleaseContext(context);
    }
    free(A);
    free(B);
    free(C);
    return exit_code;
}