使用一个内核的OpenCL高斯卷积较慢

时间:2014-04-02 10:24:47

标签: performance kernel opencl gaussian barrier

我使用两个1D高斯滤波器(高斯可分离性)在OpenCL上实现了2D高斯滤波器。

我实现了我的卷积的2个版本:   - 第一个使用一个内核,在行上应用1D过滤器,然后转换图像(大约需要20毫秒)

  • 第二个使用两个内核:一个应用卷积,一个转换图像(需要大约7ms - 卷积和大约1ms - 转置图像)

我评估了两个实现的计算时间,并且我发现使用两个内核的实现比使用一个内核的实现要快 (注意转置内核必须等待卷积内核)。

你能帮我弄清楚为什么使用一个内核的实现速度较慢,即使一个内核的设置时间应该比两个内核的设置时间快。

在下面找到两种实现的OpenCL源代码:

  • 一个内核实现

    __kernel void ConvolutionKernel(__read_only image2d_t srcBuffer,__constant int4 *par, __constant float *filter,__local float4 *cache, __local float4 *temp,__write_only image2d_t dstBuffer)
    {
        int width = par[0].x;
        int height = par[0].y;
        int widthG = par[0].z;
        int heightG = par[0].w;
        int gaussFilterWidth = par[1].x;
        int margin = 1 + (gaussFilterWidth>>2);
        int2 location;
        float4 focus,set;
        float *s = &set;
        int m,start;
        int widthGM = widthG + (margin<<1);
    
        const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST;
    
        int i = get_global_id(0);
        int j = get_global_id(1);
    
        int iG = get_local_id(0);
        int jG = get_local_id(1);
    
        location.x = i;
        location.y = j;
    
        focus = read_imagef(srcBuffer, sampler, location);
    
        cache[(iG+margin) + jG*widthGM] = focus;
    
        if(iG == 0)
        {
            if(i == 0)
            {
                cache[0 + jG*widthGM].x = focus.x;
                cache[0 + jG*widthGM].y = focus.x;
                cache[0 + jG*widthGM].z = focus.x;
                cache[0 + jG*widthGM].w = focus.x;
                cache[1 + jG*widthGM].x = focus.x;
                cache[1 + jG*widthGM].y = focus.x;
                cache[1 + jG*widthGM].z = focus.x;
                cache[1 + jG*widthGM].w = focus.x;
            }
            else
            {
                location.x = i-margin;
                location.y = j;
                cache[jG*widthGM] =  read_imagef(srcBuffer, sampler, location);
    
                location.x = i-(margin-1);
                location.y = j;
                cache[1 + jG*widthGM] =  read_imagef(srcBuffer, sampler, location);
            }
        }
        if(iG == ((widthGM-1)-(margin<<1)))
        {
            if(i == (width-1))
            {
                cache[(widthGM-2) + jG*widthGM].x = focus.w;
                cache[(widthGM-2) + jG*widthGM].y = focus.w;
                cache[(widthGM-2) + jG*widthGM].z = focus.w;
                cache[(widthGM-2) + jG*widthGM].w = focus.w;
                cache[(widthGM-1) + jG*widthGM].x = focus.w;
                cache[(widthGM-1) + jG*widthGM].y = focus.w;
                cache[(widthGM-1) + jG*widthGM].z = focus.w;
                cache[(widthGM-1) + jG*widthGM].w = focus.w;
            }
            else
            {
                location.x = i+margin;
                location.y = j;
                cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
    
                location.x = i+(margin-1);
                location.y = j;
                cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
            }
        }
    
        barrier(CLK_LOCAL_MEM_FENCE);
    
        float4  bar[10],barX[10],barY[10],barZ[10],barW[10];
        float4  *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW;
        float   *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW;
        float4  gauss[4];
        float   *gf = &gauss;
        float4  acc;
    
    
        gf[0] = filter[0];
        gf[1] = filter[1];
        gf[2] = filter[2];
        gf[3] = filter[3];
        gf[4] = filter[4];
        gf[5] = filter[5];
        gf[6] = filter[6];
        gf[7] = filter[7];
        gf[8] = filter[8];
        gf[9] = filter[9];
        gf[10]= filter[10];
        gf[11]= filter[11];
        gf[12]= filter[12];
        gf[13]= filter[13];
        gf[14]= filter[14];
        gf[15]= 0.0f;
    
    
        start = iG + jG*widthGM;
    
    
        fX[0]                                             = cache[start+0].y;
        fX[1] = fY[0]                                                     = cache[start+0].z;
        fX[2] = fY[1] = fZ[0]                                                 = cache[start+0].w;
        fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x;
        fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y;
        fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z;
        fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w;
        fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x;
        fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y;
        fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z;
        fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w;
        fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x;
        fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y;
        fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z;
        fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w;
        fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x;
        fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y;
        fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z;
        fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f;
    
        acc.x =  fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15];
        acc.y =  fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15];
        acc.z =  fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15];
        acc.w =  fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15];
    
    
        temp[iG + jG*widthG] = acc;
    
        barrier(CLK_LOCAL_MEM_FENCE);
    
        int I,S;
    
        I = j >> 2;
        S = j & 3;
    
        if(S == 0)
        {
            set.x = temp[iG + (jG+0)*widthG].x;
            set.y = temp[iG + (jG+1)*widthG].x;
            set.z = temp[iG + (jG+2)*widthG].x;
            set.w = temp[iG + (jG+3)*widthG].x;
        }
        if(S == 1)
        {
            set.x = temp[iG + (jG-1)*widthG].y;
            set.y = temp[iG + (jG+0)*widthG].y;
            set.z = temp[iG + (jG+1)*widthG].y;
            set.w = temp[iG + (jG+2)*widthG].y;
        }
        if(S == 2)
        {
            set.x = temp[iG + (jG-2)*widthG].z;
            set.y = temp[iG + (jG-1)*widthG].z;
            set.z = temp[iG + (jG+0)*widthG].z;
            set.w = temp[iG + (jG+1)*widthG].z;
        }
        if(S == 3)
        {
            set.x = temp[iG + (jG-3)*widthG].w;
            set.y = temp[iG + (jG-2)*widthG].w;
            set.z = temp[iG + (jG-1)*widthG].w;
            set.w = temp[iG + (jG+0)*widthG].w;
        }
    
    
        location.x = I;
        location.y = (i*4 + S);
    
        write_imagef(dstBuffer, location , set);
    
    }
    
  • 两个内核实现

    卷积:

    __kernel void ConvolutionKernel(__read_only image2d_t srcBuffer,
                                    __constant int4 *par,
                                    __constant float *filter,
                                    __local float4 *cache,
                                    __local float4 *temp,
                                    __write_only image2d_t dstBuffer)
    {
        int width = par[0].x;
        int height = par[0].y;
        int widthG = par[0].z;
        int heightG = par[0].w;
        int gaussFilterWidth = par[1].x;
        int margin = 1 + (gaussFilterWidth>>2);
    
        int2 location;
    
        float4 focus,set;
        float *s = &set;
        int m,start;
        int widthGM = widthG + (margin<<1);
    
        const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST;
    
        int i = get_global_id(0);
        int j = get_global_id(1);
    
        int iG = get_local_id(0);
        int jG = get_local_id(1);
    
    
    
        location.x = i;
        location.y = j;
    
        focus = read_imagef(srcBuffer, sampler, location);
    
    
        cache[(iG+margin) + jG*widthGM] = focus;
    
    
        if(iG == 0)
        {
            if(i == 0)
            {
    
    
                cache[0 + jG*widthGM].x = focus.x;
                cache[0 + jG*widthGM].y = focus.x;
                cache[0 + jG*widthGM].z = focus.x;
                cache[0 + jG*widthGM].w = focus.x;
                cache[1 + jG*widthGM].x = focus.x;
                cache[1 + jG*widthGM].y = focus.x;
                cache[1 + jG*widthGM].z = focus.x;
                cache[1 + jG*widthGM].w = focus.x;
            }
            else
            {
    
    
                location.x = i-margin;
                location.y = j;
                cache[jG*widthGM] =  read_imagef(srcBuffer, sampler, location);
    
                location.x = i-(margin-1);
                location.y = j;
                cache[1 + jG*widthGM] =  read_imagef(srcBuffer, sampler, location);
            }
        }
    
        if(iG == ((widthGM-1)-(margin<<1)))
        {
            if(i == (width-1))
            {
    
    
                cache[(widthGM-2) + jG*widthGM].x = focus.w;
                cache[(widthGM-2) + jG*widthGM].y = focus.w;
                cache[(widthGM-2) + jG*widthGM].z = focus.w;
                cache[(widthGM-2) + jG*widthGM].w = focus.w;
                cache[(widthGM-1) + jG*widthGM].x = focus.w;
                cache[(widthGM-1) + jG*widthGM].y = focus.w;
                cache[(widthGM-1) + jG*widthGM].z = focus.w;
                cache[(widthGM-1) + jG*widthGM].w = focus.w;
            }
            else
            {
    
    
                location.x = i+margin;
                location.y = j;
                cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
    
                location.x = i+(margin-1);
                location.y = j;
                cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
            }
        }
    
    
        barrier(CLK_LOCAL_MEM_FENCE);
    
        float4  bar[10],barX[10],barY[10],barZ[10],barW[10];
        float4  *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW;
        float   *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW;
        float4  gauss[4];
        float   *gf = &gauss;
        float4  acc;
    
    
        gf[0] = filter[0];
        gf[1] = filter[1];
        gf[2] = filter[2];
        gf[3] = filter[3];
        gf[4] = filter[4];
        gf[5] = filter[5];
        gf[6] = filter[6];
        gf[7] = filter[7];
        gf[8] = filter[8];
        gf[9] = filter[9];
        gf[10]= filter[10];
        gf[11]= filter[11];
        gf[12]= filter[12];
        gf[13]= filter[13];
        gf[14]= filter[14];
        gf[15]= 0.0f;
    
    
        start = iG + jG*widthGM;
    
    
        fX[0]                         = cache[start+0].y;
        fX[1] = fY[0]                 = cache[start+0].z;
        fX[2] = fY[1] = fZ[0]         = cache[start+0].w;
        fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x;
        fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y;
        fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z;
        fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w;
        fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x;
        fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y;
        fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z;
        fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w;
        fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x;
        fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y;
        fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z;
        fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w;
        fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x;
        fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y;
        fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z;
        fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f;
    
    
        acc.x =  fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15];
        acc.y =  fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15];
        acc.z =  fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15];
        acc.w =  fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15];
    
        location.x = i;
        location.y = j;
    
        write_imagef(dstBuffer, location , acc);
    
    }
    

    移调:

    __kernel void TransponseKernel(__read_only image2d_t srcBuffer,
                                   __constant int4 *par,
                                   __local float4 *temp,
                                   __write_only image2d_t dstBuffer)
    {
        int width = par[0].x;
        int height = par[0].y;
        int widthG = par[0].z;
        int heightG = par[0].w;
        int gaussFilterWidth = par[1].x;
        int margin = 1 + (gaussFilterWidth>>2);
    
        int widthGM = widthG + (margin<<1);
    
        int2 location;
        float4 focus,set;
    
        const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST;
    
        int i = get_global_id(0);
        int j = get_global_id(1);
    
        int iG = get_local_id(0);
        int jG = get_local_id(1);
    
        location.x = i;
        location.y = j;
    
        focus = read_imagef(srcBuffer, sampler, location);
    
    
        temp[iG + jG*widthG] = focus;
    
        barrier(CLK_LOCAL_MEM_FENCE);
    
        int I,S;
    
    
        I = j >> 2;
        S = j & 3;
    
        if(S == 0)
        {
            set.x = temp[iG + (jG+0)*widthG].x;
            set.y = temp[iG + (jG+1)*widthG].x;
            set.z = temp[iG + (jG+2)*widthG].x;
            set.w = temp[iG + (jG+3)*widthG].x;
        }
        if(S == 1)
        {
            set.x = temp[iG + (jG-1)*widthG].y;
            set.y = temp[iG + (jG+0)*widthG].y;
            set.z = temp[iG + (jG+1)*widthG].y;
            set.w = temp[iG + (jG+2)*widthG].y;
        }
        if(S == 2)
        {
            set.x = temp[iG + (jG-2)*widthG].z;
            set.y = temp[iG + (jG-1)*widthG].z;
            set.z = temp[iG + (jG+0)*widthG].z;
            set.w = temp[iG + (jG+1)*widthG].z;
        }
        if(S == 3)
        {
            set.x = temp[iG + (jG-3)*widthG].w;
            set.y = temp[iG + (jG-2)*widthG].w;
            set.z = temp[iG + (jG-1)*widthG].w;
            set.w = temp[iG + (jG+0)*widthG].w;
        }
    
    
        location.x = I;
        location.y = (i*4 + S);
    
        write_imagef(dstBuffer, location , set);
    
    }
    

0 个答案:

没有答案