Question

您好，
我在OpenCL编码。
我正在把一个 "C 函数" 转换成 OpenCL 内核，该函数遍历一个二维数组，循环从 i = 1 和 j = 1 开始。请看下面的代码（PFB）。

cv::Mat input; // Source image, already filled with data; accessed below as uchar pixels.
// Input size: input.rows = 288, input.cols = 640.
cv::Mat output(input.rows-2,input.cols-2,CV_32F); // Result buffer: CV_32F, two rows and two columns smaller than the input.
// Output size: output.rows = 286, output.cols = 638.

这是我想在OpenCL中修改的代码：

// Reference CPU implementation (i = row, j = column). For every visited pixel the
// two 3x3 difference masks are applied to the uchar input and the squared
// gradient magnitude xVal^2 + yVal^2 (no square root) is stored at output(i-1, j-1).
// NOTE(review): the bounds are output.rows-1 / output.cols-1, so i stops at
// output.rows-2 and j at output.cols-2; the last two rows and columns of `output`
// are never written by this loop. If the whole output is meant to be filled the
// bounds would presumably be input.rows-1 / input.cols-1 — confirm.
for(int i=1;i<output.rows-1;i++)
{
  for(int j=1;j<output.cols-1;j++)
    {
        // Horizontal difference: left column minus right column, middle row weighted by 2.
        float xVal = input.at<uchar>(i-1,j-1)-input.at<uchar>(i-1,j+1)+ 2*(input.at<uchar>(i,j-1)-input.at<uchar>(i,j+1))+input.at<uchar>(i+1,j-1) - input.at<uchar>(i+1,j+1);
        // Vertical difference: top row minus bottom row, middle column weighted by 2.
        float yVal = input.at<uchar>(i-1,j-1) - input.at<uchar>(i+1,j-1)+ 2*(input.at<uchar>(i-1,j)   - input.at<uchar>(i+1,j))+input.at<uchar>(i-1,j+1)-input.at<uchar>(i+1,j+1);
        // Output is shifted by one in each direction relative to the input centre pixel.
        output.at<float>(i-1,j-1) = xVal*xVal+yVal*yVal;
    }
}

... 主机代码：

// Host side: wrap the two images in OpenCL buffers, bind them to the kernel,
// launch a 2-D NDRange and map the result back.
// Input image size:  input.rows = 288,  input.cols = 640.
// Output image size: output.rows = 286, output.cols = 638.
// The NDRange is set to the INPUT size (dimension 0 = columns, dimension 1 = rows).
// NOTE(review): the Sobel_uchar kernel adds 1 to each global id and also reads
// the +1 neighbour, so with this range the last work-items index past the
// 288x640 input and past the 286x638 output. The output size
// (input.cols-2, input.rows-2) looks like the intended range — confirm.
 OclStr->global_work_size[0] =(input.cols);
 OclStr->global_work_size[1] =(input.rows);

 // Size in bytes of the output buffer; the literal 4 matches the sizeof(float)
 // used for cl_output_buffer below, assuming a 4-byte float.
 size_t outBufSize = (output.rows) * (output.cols) * 4;

    // Read-only buffer backed directly by input.data (CL_MEM_USE_HOST_PTR),
    // rows * cols bytes, i.e. one byte per pixel.
    // NOTE(review): assumes `input` is a continuous single-channel 8-bit Mat — confirm.
    cl_mem cl_input_buffer = clCreateBuffer(
        OclStr->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR ,
        (input.rows) * (input.cols),
        static_cast<void *>(input.data), &OclStr->returnstatus);

    // Write-only buffer backed directly by output.data, rows * cols floats with
    // no padding between rows (row pitch = output.cols elements).
    cl_mem cl_output_buffer = clCreateBuffer(
        OclStr->context, CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR , 
        (output.rows) * (output.cols) * sizeof(float), 
        static_cast<void *>(output.data), &OclStr->returnstatus);

// Kernel arguments: 0 = source pixels, 1 = destination floats.
// NOTE(review): returnstatus is overwritten by every call and never inspected
// in this excerpt, so a failing call would go unnoticed.
OclStr->returnstatus = clSetKernelArg(OclStr->objkernel, 0, sizeof(cl_mem), (void *)&cl_input_buffer);
OclStr->returnstatus = clSetKernelArg(OclStr->objkernel, 1, sizeof(cl_mem), (void *)&cl_output_buffer); 

    // 2-D launch, no global offset, local work size left to the implementation (NULL).
    OclStr->returnstatus = clEnqueueNDRangeKernel(
        OclStr->command_queue, 
        OclStr->objkernel, 
        2, 
        NULL, 
        OclStr->global_work_size, 
        NULL, 
        0, 
        NULL, 
        NULL
        );
// Blocking map (third argument true) of the whole output buffer for reading.
// NOTE(review): the pointer returned by clEnqueueMapBuffer is discarded, and no
// matching unmap or release call is visible in this excerpt.
clEnqueueMapBuffer(OclStr->command_queue, cl_output_buffer, true, CL_MAP_READ, 0, outBufSize, 0, NULL, NULL, &OclStr->returnstatus);

内核代码：

// Sobel_uchar: each work-item reads the 8 neighbours of the pixel at
// (cols, rows) = (global id + 1) from pSrc and writes gx^2 + gy^2 to pDstImage.
//   pSrc      - 8-bit source pixels, addressed as column + row * width.
//   pDstImage - float results, written at (cols-1) + (rows-1) * width.
// NOTE(review): `width` is get_global_size(0) and serves as the row pitch of
// BOTH buffers. With the host code shown (global size = 640x288 input size,
// output 638 floats per row) the output pitch does not match the output
// buffer, and the +1 neighbours of the last work-items fall outside the
// input — confirm against the launch parameters.
__kernel void Sobel_uchar (__global uchar *pSrc, __global float *pDstImage)              
{                                                                                      
const uint cols = get_global_id(0)+1;                                              
const uint rows = get_global_id(1)+1;                                              
const uint width= get_global_size(0);                                              
uchar Opsoble[8];                                                                  
Opsoble[0] = pSrc[(cols-1)+((rows-1)*width)];  // top-left
Opsoble[1] = pSrc[(cols+1)+((rows-1)*width)];  // top-right
Opsoble[2] = pSrc[(cols-1)+((rows+0)*width)];  // left
Opsoble[3] = pSrc[(cols+1)+((rows+0)*width)];  // right
Opsoble[4] = pSrc[(cols-1)+((rows+1)*width)];  // bottom-left
Opsoble[5] = pSrc[(cols+1)+((rows+1)*width)];  // bottom-right
Opsoble[6] = pSrc[(cols+0)+((rows-1)*width)];  // top
Opsoble[7] = pSrc[(cols+0)+((rows+1)*width)];  // bottom
// Left column minus right column, middle row weighted by 2.
float gx =   Opsoble[0]-Opsoble[1]+2*(Opsoble[2]-Opsoble[3])+Opsoble[4]-Opsoble[5];
// Top row minus bottom row, middle column weighted by 2.
float gy =   Opsoble[0]-Opsoble[4]+2*(Opsoble[6]-Opsoble[7])+Opsoble[1]-Opsoble[5];
// Squared gradient magnitude, stored one pixel up/left of the centre.
pDstImage[(cols-1)+(rows-1)*width] = gx*gx + gy*gy;                                

    }

这里我无法按预期获得输出。我有一些问题

我的for循环从i = 1而不是0开始，那么如何通过在x和y方向使用global_id（）来获得正确的索引
上面的内核代码出了什么问题：（

我怀疑是缓冲区步幅（stride）出了问题，但我已经为此折腾了一整天，实在想不出别的办法了 :( 我观察到，使用下面的逻辑时，输出每隔 7、8 帧就会跳过一两帧。我附上了输出的屏幕截图，并与参考输出做了对比。上面的逻辑只对我的输入做了部分处理。我把宽度改成了：

// NOTE(review): with the global size set to the input size (640 columns) this
// gives 641, which matches neither the 640-pixel input pitch nor the
// 638-float output pitch used by the host buffers.
const uint width = get_global_size(0)+1;

PFB

非常欢迎您的建议!!! enter image description here

Answer 1

看起来您可能在opencl版本中以（y，x）格式获取值。此外，您需要在全局ID中添加1，以便从1而不是0开始复制for循环。

我不知道为什么有一个未使用的iOffset变量。也许你的bug与此有关？我在我的版本中删除了它。

这个内核对你有用吗？

/*
 * simple: squared Sobel gradient magnitude (gx^2 + gy^2, no square root).
 *
 * One work-item produces one OUTPUT pixel, so the kernel must be launched
 * with a 2-D global size equal to the output size:
 *     global_work_size = { output.cols, output.rows }
 *                      = { input.cols - 2, input.rows - 2 }
 *
 *   pSrc      - 8-bit source image, row-major, tightly packed,
 *               (get_global_size(0) + 2) pixels per row.
 *   pDstImage - float output image, row-major, tightly packed,
 *               get_global_size(0) values per row.
 *
 * The source and destination have DIFFERENT row pitches; using the source
 * pitch for the destination skews every output row and writes past the end
 * of the output buffer.
 */
__kernel void simple(__global const uchar *pSrc, __global float *pDstImage)
{
    /* Output pixel handled by this work-item (x = column, y = row). */
    const uint x = get_global_id(0);
    const uint y = get_global_id(1);

    const uint dstWidth = get_global_size(0);
    const uint srcWidth = dstWidth + 2;   /* one-pixel border on each side */

    /* Centre of the 3x3 window in source coordinates: the "+1" reproduces
     * the CPU loop that starts at i = 1, j = 1. */
    const uint cx = x + 1;
    const uint cy = y + 1;

    /* 3x3 neighbourhood, named p<row><col>; widened to int so the
     * differences below can go negative. The centre pixel is not needed. */
    const int p00 = pSrc[(cx - 1) + (cy - 1) * srcWidth];
    const int p01 = pSrc[(cx    ) + (cy - 1) * srcWidth];
    const int p02 = pSrc[(cx + 1) + (cy - 1) * srcWidth];
    const int p10 = pSrc[(cx - 1) + (cy    ) * srcWidth];
    const int p12 = pSrc[(cx + 1) + (cy    ) * srcWidth];
    const int p20 = pSrc[(cx - 1) + (cy + 1) * srcWidth];
    const int p21 = pSrc[(cx    ) + (cy + 1) * srcWidth];
    const int p22 = pSrc[(cx + 1) + (cy + 1) * srcWidth];

    /* Horizontal gradient: left column minus right column. */
    const float gx = (float)(p00 - p02 + 2 * (p10 - p12) + p20 - p22);
    /* Vertical gradient: top row minus bottom row. */
    const float gy = (float)(p00 - p20 + 2 * (p01 - p21) + p02 - p22);

    /* Destination is addressed with its own pitch. */
    pDstImage[x + y * dstWidth] = gx * gx + gy * gy;
}

Answer 2

我有点担心发布建议对内核进行优化的答案，因为原始输出尚未完全复制。对于与图像处理/过滤相关的问题，可以进行重大改进。

使用本地存储器（local memory）可以把全局内存的读取次数减少到原来的约 1/8；同时让每个输出像素只写一次全局内存，并把这些写操作集中在一起，从而可能获得性能提升。

下面的内核从pSrc读取最多34x34的块，并输出pDstImage的32x32（max）区域。我希望代码中的注释足以指导您使用内核。我无法对此进行全面测试，因此可能需要进行更改。任何评论也会受到赞赏。

/*
 * sobel_uchar_wlocal: squared Sobel gradient magnitude (gx^2 + gy^2) computed
 * from a tile of the source image staged in local memory.
 *
 *   pSrc        - 8-bit source image, row-major, tightly packed,
 *                 (dimDstImage.x + 2) pixels per row and (dimDstImage.y + 2)
 *                 rows, i.e. the output area plus a one-pixel border.
 *   pDstImage   - float output image, row-major, tightly packed,
 *                 dimDstImage.x values per row.
 *   dimDstImage - (width, height) of the output image in pixels. Passed by
 *                 value: set it with clSetKernelArg(..., sizeof(cl_uint2), &dim).
 *
 * Launch configuration (2-D NDRange):
 *   local size  = { L, 1 } with L <= 32
 *   global size = { ceil(width / L) * L, ceil(height / L) }
 * Each work-group of L work-items produces one L x L output tile (clipped at
 * the right and bottom image edges); work-item `lid` owns column `lid` of the
 * tile and walks down its rows.
 */
__kernel void sobel_uchar_wlocal(__global const uchar *pSrc, __global float *pDstImage, uint2 dimDstImage)
{
    /* Signed arithmetic throughout: mixing int loop counters with uint
     * bounds turns negative values into huge unsigned ones. */
    const int lid      = (int)get_local_id(0);
    const int localDim = (int)get_local_size(0);   /* tile edge length, must be <= 32 */

    const int dstW = (int)dimDstImage.x;
    const int dstH = (int)dimDstImage.y;
    const int srcW = dstW + 2;                     /* source row pitch */

    /* Top-left corner of this group's tile in output coordinates. Because
     * the source carries a one-pixel border, the same coordinates address
     * the top-left corner of the (tile + 2) x (tile + 2) source window. */
    const int tileX = localDim * (int)get_group_id(0);
    const int tileY = localDim * (int)get_group_id(1);

    /* Clip the tile where the work-group overhangs the right/bottom edge. */
    const int remW  = dstW - tileX;
    const int remH  = dstH - tileY;
    const int tileW = (remW < localDim) ? remW : localDim;
    const int tileH = (remH < localDim) ? remH : localDim;

    /* The condition depends only on the group id and the kernel arguments,
     * so either every work-item of the group returns here or none does;
     * the barrier below is therefore never reached by only part of a group. */
    if (tileW <= 0 || tileH <= 0) {
        return;
    }

    /* Stage the source window in local memory: 34 x 34 uchar = 1156 bytes.
     * Indexed [row][column]; work-items stride across the columns so that
     * neighbouring work-items read neighbouring global addresses. */
    __local uchar srcBuff[34][34];
    for (int row = 0; row < tileH + 2; ++row) {
        for (int col = lid; col < tileW + 2; col += localDim) {
            srcBuff[row][col] = pSrc[(tileX + col) + (tileY + row) * srcW];
        }
    }

    /* Work-group barrier: every work-item must have finished loading before
     * any of them reads pixels loaded by its neighbours. A mem_fence only
     * orders the calling work-item's own accesses and is not sufficient. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Compute and store. Each pixel is written to global memory exactly once,
     * and for a given row the work-items write consecutive addresses, so no
     * intermediate local output buffer is needed. */
    if (lid < tileW) {
        const int cx = lid + 1;                    /* window centre column in srcBuff */
        for (int row = 0; row < tileH; ++row) {
            const int cy = row + 1;                /* window centre row in srcBuff */

            /* 3x3 neighbourhood, named p<row><col>, widened to int so the
             * differences can go negative. */
            const int p00 = srcBuff[cy - 1][cx - 1];
            const int p01 = srcBuff[cy - 1][cx    ];
            const int p02 = srcBuff[cy - 1][cx + 1];
            const int p10 = srcBuff[cy    ][cx - 1];
            const int p12 = srcBuff[cy    ][cx + 1];
            const int p20 = srcBuff[cy + 1][cx - 1];
            const int p21 = srcBuff[cy + 1][cx    ];
            const int p22 = srcBuff[cy + 1][cx + 1];

            /* Horizontal gradient: left column minus right column. */
            const float gx = (float)(p00 - p02 + 2 * (p10 - p12) + p20 - p22);
            /* Vertical gradient: top row minus bottom row. */
            const float gy = (float)(p00 - p20 + 2 * (p01 - p21) + p02 - p22);

            pDstImage[(tileX + lid) + (tileY + row) * dstW] = gx * gx + gy * gy;
        }
    }
}

OpenCL：使用 get_global_id() 访问正确的索引

2 个答案: