如何将行处理内核过滤器和列处理内核过滤器合并为单个openCL内核

时间:2014-11-06 13:46:31

标签: c optimization opencl

我在 OpenCL 中用行处理和列处理两个内核实现了一个图像滤波器,但大部分时间都浪费在了多次启动内核上。

那么,能否将这两个内核合并为一个功能相同、并且在英特尔 HD4600 显卡上性能更好的单一内核?代码详情如下:

假设:
1. 水平和垂直方向的填充均由主机端(C 程序)完成。
2. N(滤波器长度)为 8,图像宽度和高度为 1024 x 1024,滤波器系数由通用滤波器生成。
3. 使用以下 API 先启动行内核,再启动列内核:
   ret |= clEnqueueNDRangeKernel(command_queue,kernel,2,NULL,global_ws(1024x1024),NULL,0,NULL,NULL);

//代码:

/*
 * Row pass of the filter: each work-item produces one output element.
 *
 * For global id (x, y) the centre index is
 *     y * pitch + x + (N * pitch + N)
 * i.e. the work-item grid is shifted by N rows and N columns into the buffer.
 * The kernel sums the 2*N+1 consecutive input elements centred on that index,
 * weighted by W[0 .. 2*N], and stores the sum at the same index in op_img.
 *
 * ip_img - input buffer (read at centre-N .. centre+N)
 * op_img - output buffer (written at centre)
 * width, height - accepted but not read by this kernel
 * pitch  - element stride between successive y values
 * N      - number of taps on each side of the centre
 * W      - weights; indices 0 .. 2*N are read
 *
 * No bounds checks are performed; the caller must size and pad the buffers
 * so that every index touched above is valid.
 */
__kernel void filter_rows(__global float *ip_img,__global float *op_img,
                          int width, int height,int pitch,int N,__constant float *W)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    /* Skip the N-element border on the top and on the left. */
    const int border_offset = N * pitch + N;
    const int centre = y * pitch + x + border_offset;

    /* Accumulate taps left to right: tap 0 is centre-N, tap 2N is centre+N. */
    float acc = 0.0f;
    const int last_tap = 2 * N;
    for (int tap = 0; tap <= last_tap; ++tap)
    {
        acc += ip_img[centre + (tap - N)] * W[tap];
    }

    op_img[centre] = acc;
}
/*
 * Column pass of the filter: each work-item produces one output element.
 *
 * For global id (x, y) the centre index is
 *     y * pitch + x + (N * pitch + N)
 * i.e. the work-item grid is shifted by N rows and N columns into the buffer.
 * The kernel sums 2*N+1 input elements spaced `pitch` apart and centred on
 * that index, weighted by W[0 .. 2*N], and stores the sum at the same index
 * in op_img.
 *
 * ip_img - input buffer (read at centre - N*pitch .. centre + N*pitch)
 * op_img - output buffer (written at centre)
 * width, height - accepted but not read by this kernel
 * pitch  - element stride between successive y values
 * N      - number of taps on each side of the centre
 * W      - weights; indices 0 .. 2*N are read
 *
 * No bounds checks are performed; the caller must size and pad the buffers
 * so that every index touched above is valid.
 */
__kernel void filter_col(__global float *ip_img,__global float *op_img,int width,
                         int height,int pitch,int N,__constant float *W)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    /* Skip the N-element border on the top and on the left. */
    const int border_offset = N * pitch + N;
    const int centre = y * pitch + x + border_offset;

    /* Start N strides before the centre and step one stride per tap. */
    float acc = 0.0f;
    const int last_tap = 2 * N;
    int src = centre - N * pitch;
    for (int tap = 0; tap <= last_tap; ++tap)
    {
        acc += ip_img[src] * W[tap];
        src += pitch;
    }

    op_img[centre] = acc;
}
/*
 * Horizontal padding step. The accompanying notes state that padding is done
 * on the host in plain C; the implementation is elided in this listing.
 *
 * ip_img   - image buffer to pad
 * pad_leng - padding length
 *            NOTE(review): presumably the same N the filter kernels use as
 *            their border size - confirm against the real implementation.
 *
 * The parameter type of pad_leng was missing; it is declared as int because
 * implicit int is not valid since C99.
 */
void padd_hor(float *ip_img, int pad_leng)
{
    //...using simple C programming
}
/*
 * Vertical padding step. The accompanying notes state that padding is done
 * on the host in plain C; the implementation is elided in this listing.
 *
 * ip_img   - image buffer to pad
 * pad_leng - padding length
 *            NOTE(review): presumably the same N the filter kernels use as
 *            their border size - confirm against the real implementation.
 *
 * The parameter type of pad_leng was missing; it is declared as int because
 * implicit int is not valid since C99.
 */
void padd_ver(float *ip_img, int pad_leng)
{
    //...using simple c programming
}
/*
 * Sketch of the two-pass pipeline that the fused kernel is meant to
 * reproduce:
 *   1. pad in_image horizontally,
 *   2. row pass:    in_image   -> temp_image (weights Wr),
 *   3. pad temp_image vertically,
 *   4. column pass: temp_image -> out_image  (weights Wc).
 *
 * in_image   - source image; padded before the row pass
 * out_image  - final result
 * temp_image - intermediate buffer holding the row-pass output
 * width, height, pitch, N - forwarded unchanged to both passes
 * Wr, Wc     - weights forwarded to the row and column pass respectively
 *
 * Fixes relative to the original listing: `_global` -> `__global`, missing
 * semicolons after the padding calls, `pad_ver` -> `padd_ver`, and the
 * undeclared filter_length / filter_coeff_hor / filter_coeff_ver replaced by
 * the parameters N / Wr / Wc.
 * NOTE(review): that mapping is inferred from the parameter list - confirm.
 *
 * NOTE(review): this remains an illustration, not launchable code:
 *   - padd_hor/padd_ver are declared with a plain `float *` parameter, while
 *     `__global float *` pointers are passed here;
 *   - filter_rows/filter_col derive their pixel from get_global_id(), so a
 *     direct call handles only the caller's own element, and nothing here
 *     orders the row pass of other work-items before the column pass;
 *   - a __kernel with the same name is defined elsewhere in this listing, so
 *     one of the two has to be renamed.
 */
void generic_filter(__global float *in_image,__global float *out_image,
__global float *temp_image,int width, int height,int pitch,int N,
__constant float *Wr,__constant float *Wc)
{
    padd_hor(in_image, N);
    filter_rows(in_image, temp_image, width, height, pitch, N, Wr);
    padd_ver(temp_image, N);
    filter_col(temp_image, out_image, width, height, pitch, N, Wc);
}
__kernel generic_filter(_global float *in_image,__global float *out_image,__global      float*temp_image,
int width, int height,int pitch,int N,__constant float *Wr,__constant float *Wc)
{
    // ... here i need your suggetion to implement the kernel which do same as generic_filter
}

非常感谢您帮助优化此滤波器以获得最佳性能。另外,请告诉我相对于在英特尔 CPU 上运行的 C 代码,最多可以获得多大的性能提升。

谢谢和问候 Vijayky88

0 个答案:

没有答案