在OpenCL中高效重复调用NDRangeKernel

时间:2012-02-20 11:47:28

标签: opencl

我写了以下代码。其中有一个循环,在 Red 和 Black 两个内核之间交替迭代。每次迭代我都会调用 clEnqueueReadBuffer,我认为这样做效率很低。有没有其他方法可以高效地重复调用内核?谢谢

#include <stdio.h>
#include <stdlib.h> 
#include <string>
#include <iostream>
#include <cmath>
#include <ctime>
#include <oclUtils.h>

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif


/* Number of matrix elements: SQ * SQ = 1024 * 1024. */
#define DATA_SIZE (1048576)
/* Nanoseconds per millisecond. */
#define NANO_TO_MILI 1e6
#define MAX_ITER 1
/* Upper bound on solver iterations. */
#define LIMIT 100
/* Parenthesized so the macro expands safely inside larger expressions. */
#define BIG_RANGE (LIMIT*4*100)

/* Convergence threshold compared against norm(). */
#define EPS 1e-2
/* Matrix dimension (rows == columns). */
#define SQ 1024

/* Row-major element access into the flat array named A; arguments are
 * parenthesized so compound expressions such as A(i+1, j) index correctly. */
#define A(i,j) A[(i)*SQ+(j)]

using namespace std;

/* OpenCL handles shared by the helper functions in this file. */
cl_platform_id platforms;           /* single platform returned by clGetPlatformIDs */
cl_device_id device;
cl_context context;
cl_program program1, program2;
cl_command_queue command;
cl_int err;                         /* status of the most recent OpenCL call */
cl_kernel kernel_red, kernel_black;
cl_int i;
cl_mem input_A,input_b,in_out_X;    /* device buffers for A, b and the iterate X */
cl_event timing_event;
cl_ulong time_start, time_end,total_time = 0;


/* Build options passed to clBuildProgram. */
const char options[] = "-cl-mad-enable -cl-finite-math-only -Werror -DWIDTH=1024 -DHEIGHT=1024";
/* Kernel entry-point names; const-qualified because they point at string literals. */
const char *kernel_names[] = {"Red","Black"};

float norm (float*,float*,int);
void swap(float **in, float **out); 

void CreateQueue(void);
void CreateKernel(void);
void CreateBuffer(unsigned int);
void Enqueue_Write_Buffer(unsigned int);
void Kernel_Arg_Set(cl_kernel, unsigned int);
void Enqueue_Read_Buffer(unsigned int);
void Create_Work_Group(cl_kernel, unsigned int);
void Shutdown();

/* Host-side arrays: the SQ x SQ matrix, the two iterate vectors and the right-hand side. */
float *A,*oldX,*newX,*b;

/*
 * Program entry point.
 *
 * Builds a tridiagonal SQ x SQ system on the host (diagonal, super- and
 * sub-diagonal entries of A plus right-hand side b), uploads it to the
 * device, and alternates the red and black kernels until norm() drops to
 * EPS or LIMIT iterations have run.
 *
 * Returns 0 on success, 1 if host memory cannot be allocated. The OpenCL
 * helpers terminate the process themselves on failure.
 */
int main(int argc, char** argv) {
    const unsigned int count = DATA_SIZE;
    const float h = 1.0f/SQ;
    int i,j;
    int cnt = 0;

    A = (float*)malloc(sizeof(float)*count);
    newX = (float*)malloc(sizeof(float)*SQ);
    oldX = (float*)malloc(sizeof(float)*SQ);
    b = (float*)malloc(sizeof(float)*SQ);

    if (!A || !newX || !oldX || !b) {
        fprintf(stderr, "Error: Failed to allocate host memory!\n");
        free(A);
        free(newX);
        free(oldX);
        free(b);
        return 1;
    }

    for (i=0;i<SQ;i++){
        const float x = 0.0f + (i+1)*h;

        oldX[i] = 0.0f;
        /* Zero every element so norm() never reads indeterminate values. */
        newX[i] = 0.0f;

        if (i != 0) b[i] = -2.0f*x; else b[i] = -2.0f*x-1.0f/(h*h)+1.0f/(2.0f*h);

        for(j=0;j<SQ;j++) A(i,j) =0.0f;
        A(i,i) = -2.0f/(h*h);
        /* The row is already zeroed, so the first and last rows simply skip
         * the neighbour that would fall outside the matrix (and the array). */
        if (i != SQ-1) A(i,i+1) = 1.0f/(h*h) + 1.0f/(2.0f*h);
        if (i != 0)    A(i,i-1) = 1.0f/(h*h) - 1.0f/(2.0f*h);
    }

    /* Sentinel that makes the first convergence test fail so the loop is entered. */
    newX[0] = BIG_RANGE;

    CreateQueue();

    CreateKernel();

    CreateBuffer(count);

    Kernel_Arg_Set(kernel_red  ,count);
    Kernel_Arg_Set(kernel_black,count);

    Enqueue_Write_Buffer(count);

    /* NOTE(review): norm() is defined outside this view; presumably it measures
     * the distance between its two vector arguments - confirm. */
    while(norm(oldX,newX,SQ) > EPS && cnt<LIMIT){

        /* Keep the previous iterate in newX so the next norm() call compares
         * two successive iterates (Enqueue_Read_Buffer overwrites oldX). */
        for (i=0;i<SQ;i++) newX[i] = oldX[i];

        /* The command queue is created in-order, so the black kernel runs
         * after the red one without a host read-back in between. */
        Create_Work_Group(kernel_red, count);
        Create_Work_Group(kernel_black, count);

        cnt++;

        /* One blocking read per iteration refreshes oldX for the next test. */
        Enqueue_Read_Buffer(count);
    }

    clFinish(command);

    Shutdown();

    /* Host arrays are released only after the queue has drained, because the
     * initial uploads were enqueued as non-blocking writes. */
    free(A);
    free(b);
    free(oldX);
    free(newX);
    return 0;
}




void CreateQueue(){
err = clGetPlatformIDs(1, &platforms, NULL);
if(err<0){
    perror("no platform");getchar();exit(1);}

err = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &device,NULL);
if(err<0){
    perror("no device");getchar();exit(1);}

context = clCreateContext(NULL, 1, &device,NULL, NULL, &err);
if(err < 0) {
    perror("Couldn't create a context");exit(1);}

command = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if (!command)
    {
        printf("Error: Failed to create a command commands!\n");
        exit(1);
    }

clEnqueueBarrier(command);


}

void CreateBuffer(unsigned int count){

    input_A  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * count, A, NULL);
    in_out_X = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, sizeof(float) * SQ, oldX, NULL);
    input_b  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * SQ, b, NULL);

    if (!input_A || !input_b || !in_out_X)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }    
}


void CreateKernel(){

    FILE *fp;
    size_t program_size;
    string kernel_src;
    fp = fopen("Red.cl", "r");
    fseek(fp, 0, SEEK_END);
    program_size = ftell(fp);
    kernel_src.resize(program_size + 1);
    fseek(fp, 0, SEEK_SET);
    fread(&kernel_src[0], program_size, 1, fp);
    fclose(fp);
    kernel_src[program_size] = '\0';


const char *src = &kernel_src[0];
program1 = clCreateProgramWithSource(context, 1,&src, NULL, &err);

if (!program1)
   {
      printf("clCreateProgramWithSource failed\n");
      exit(1);
   }

err =clBuildProgram(program1, 1, &device, options, NULL, NULL);

if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2*2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program1, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }



kernel_red   = clCreateKernel(program1, kernel_names[0], &err);

if (!kernel_red || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }


kernel_black   = clCreateKernel(program1, kernel_names[1], &err);

if (!kernel_black || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

}

/*
 * Enqueues one 2-D launch of `kernel` on the shared command queue with an
 * SQ x SQ global size and 32 x 32 work-groups.
 *
 * kernel: kernel object to launch.
 * count:  unused.
 * Exits with status 1 if the launch cannot be enqueued.
 *
 * NOTE(review): a 32 x 32 group is 1024 work-items; confirm the target
 * device's maximum work-group size allows it.
 */
void Create_Work_Group(cl_kernel kernel, unsigned int count){
    const cl_uint work_dim = 2;
    /* The third entry is ignored because only two dimensions are used. */
    size_t global_size[] = {SQ,SQ,0};
    size_t local_size[]  = {32,32,0};

    err = clEnqueueNDRangeKernel(command, kernel, work_dim, NULL,
                                 global_size, local_size, 0, NULL, NULL);
    if (err == CL_SUCCESS) {
        return;
    }

    printf("Error: Failed to execute kernel!\n");
    exit(1);
}

/*
 * Binds the three device buffers to `kernel`: argument 0 = input_A,
 * argument 1 = in_out_X, argument 2 = input_b.
 *
 * kernel: kernel object whose arguments are set.
 * count:  unused.
 * Exits with status 1 on the first failing call, printing that call's
 * OpenCL status code.
 */
void Kernel_Arg_Set(cl_kernel kernel,unsigned int count){
    cl_mem *const buffers[] = { &input_A, &in_out_X, &input_b };
    const cl_uint num_args = sizeof(buffers) / sizeof(buffers[0]);

    /* Each call is checked on its own: OR-ing several OpenCL status codes
     * together would print a value that matches no real error code. */
    for (cl_uint arg = 0; arg < num_args; ++arg) {
        err = clSetKernelArg(kernel, arg, sizeof(cl_mem), buffers[arg]);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to set kernel arguments! %d\n", err);
            exit(1);
        }
    }
}

/*
 * Performs a blocking read of SQ floats from the device buffer in_out_X
 * into the host array oldX.
 *
 * count: unused.
 * Exits with status 1 (after printing the OpenCL status code) on failure.
 */
void Enqueue_Read_Buffer(unsigned int count){
    const size_t vector_bytes = sizeof(float) * SQ;

    err = clEnqueueReadBuffer(command, in_out_X, CL_TRUE, 0, vector_bytes,
                              oldX, 0, NULL, NULL);
    if (err == CL_SUCCESS) {
        return;
    }

    printf("Error: Failed to read output array! %d\n", err);
    exit(1);
}

void Enqueue_Write_Buffer(unsigned int count){
     err  = clEnqueueWriteBuffer(command, input_A , CL_FALSE, 0, sizeof(float) * count,   A, 0, NULL,  NULL);
     err |= clEnqueueWriteBuffer(command, input_b , CL_FALSE, 0, sizeof(float) * SQ   ,   b, 0, NULL,  NULL);
     err |= clEnqueueWriteBuffer(command, in_out_X, CL_FALSE, 0, sizeof(float) * SQ   ,oldX, 0, NULL,  NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }

}

1 个答案:

答案 0 :(得分:2)

你现在的做法非常低效。你可以只写入一次缓冲区,然后将任意多个内核排入队列,并让它们都使用同一个缓冲区作为参数。当然,如果需要计算范数(norm),就必须把数据读回主机。我建议这样做:

  1. 为范数创建一个额外的缓冲区;在每个内核的开头检查当前范数(只需读取该值);如果它小于阈值,则立即返回。

  2. 创建一个新的内核,专门用来计算范数。

  3. 按如下方式将任务排入队列:

    • 写缓冲区,
    • 内核:{{red,black} * 10,updateNorm} * 10
    • 读取缓冲区。

    计算内核每运行10次,范数就会更新一次。如果范数已经满足要求,那么已经入队的计算内核会立即返回。队列执行完毕后,读回缓冲区并在CPU上检查范数。如果范数仍未满足要求,则再次将同一批内核排入队列。

    在最坏的情况下,你会浪费9次真正执行的内核运行和90次立即返回的内核运行。