我使用两个1D高斯滤波器(高斯可分离性)在OpenCL上实现了2D高斯滤波器。
我实现了我的卷积的2个版本: - 第一个使用一个内核,在行上应用1D过滤器,然后转换图像(大约需要20毫秒)
我评估了两个实现的计算时间,并且我发现使用两个内核的实现比使用一个内核的实现要快 (注意转置内核必须等待卷积内核)。
你能帮我弄清楚为什么使用一个内核的实现速度较慢,即使一个内核的设置时间应该比两个内核的设置时间快。
在下面找到两种实现的OpenCL源代码:
一个内核实现
__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer,__constant int4 *par, __constant float *filter,__local float4 *cache, __local float4 *temp,__write_only image2d_t dstBuffer)
{
int width = par[0].x;
int height = par[0].y;
int widthG = par[0].z;
int heightG = par[0].w;
int gaussFilterWidth = par[1].x;
int margin = 1 + (gaussFilterWidth>>2);
int2 location;
float4 focus,set;
float *s = &set;
int m,start;
int widthGM = widthG + (margin<<1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST;
int i = get_global_id(0);
int j = get_global_id(1);
int iG = get_local_id(0);
int jG = get_local_id(1);
location.x = i;
location.y = j;
focus = read_imagef(srcBuffer, sampler, location);
cache[(iG+margin) + jG*widthGM] = focus;
if(iG == 0)
{
if(i == 0)
{
cache[0 + jG*widthGM].x = focus.x;
cache[0 + jG*widthGM].y = focus.x;
cache[0 + jG*widthGM].z = focus.x;
cache[0 + jG*widthGM].w = focus.x;
cache[1 + jG*widthGM].x = focus.x;
cache[1 + jG*widthGM].y = focus.x;
cache[1 + jG*widthGM].z = focus.x;
cache[1 + jG*widthGM].w = focus.x;
}
else
{
location.x = i-margin;
location.y = j;
cache[jG*widthGM] = read_imagef(srcBuffer, sampler, location);
location.x = i-(margin-1);
location.y = j;
cache[1 + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
}
}
if(iG == ((widthGM-1)-(margin<<1)))
{
if(i == (width-1))
{
cache[(widthGM-2) + jG*widthGM].x = focus.w;
cache[(widthGM-2) + jG*widthGM].y = focus.w;
cache[(widthGM-2) + jG*widthGM].z = focus.w;
cache[(widthGM-2) + jG*widthGM].w = focus.w;
cache[(widthGM-1) + jG*widthGM].x = focus.w;
cache[(widthGM-1) + jG*widthGM].y = focus.w;
cache[(widthGM-1) + jG*widthGM].z = focus.w;
cache[(widthGM-1) + jG*widthGM].w = focus.w;
}
else
{
location.x = i+margin;
location.y = j;
cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
location.x = i+(margin-1);
location.y = j;
cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
float4 bar[10],barX[10],barY[10],barZ[10],barW[10];
float4 *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW;
float *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW;
float4 gauss[4];
float *gf = &gauss;
float4 acc;
gf[0] = filter[0];
gf[1] = filter[1];
gf[2] = filter[2];
gf[3] = filter[3];
gf[4] = filter[4];
gf[5] = filter[5];
gf[6] = filter[6];
gf[7] = filter[7];
gf[8] = filter[8];
gf[9] = filter[9];
gf[10]= filter[10];
gf[11]= filter[11];
gf[12]= filter[12];
gf[13]= filter[13];
gf[14]= filter[14];
gf[15]= 0.0f;
start = iG + jG*widthGM;
fX[0] = cache[start+0].y;
fX[1] = fY[0] = cache[start+0].z;
fX[2] = fY[1] = fZ[0] = cache[start+0].w;
fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x;
fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y;
fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z;
fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w;
fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x;
fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y;
fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z;
fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w;
fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x;
fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y;
fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z;
fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w;
fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x;
fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y;
fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z;
fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f;
acc.x = fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15];
acc.y = fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15];
acc.z = fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15];
acc.w = fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15];
temp[iG + jG*widthG] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
int I,S;
I = j >> 2;
S = j & 3;
if(S == 0)
{
set.x = temp[iG + (jG+0)*widthG].x;
set.y = temp[iG + (jG+1)*widthG].x;
set.z = temp[iG + (jG+2)*widthG].x;
set.w = temp[iG + (jG+3)*widthG].x;
}
if(S == 1)
{
set.x = temp[iG + (jG-1)*widthG].y;
set.y = temp[iG + (jG+0)*widthG].y;
set.z = temp[iG + (jG+1)*widthG].y;
set.w = temp[iG + (jG+2)*widthG].y;
}
if(S == 2)
{
set.x = temp[iG + (jG-2)*widthG].z;
set.y = temp[iG + (jG-1)*widthG].z;
set.z = temp[iG + (jG+0)*widthG].z;
set.w = temp[iG + (jG+1)*widthG].z;
}
if(S == 3)
{
set.x = temp[iG + (jG-3)*widthG].w;
set.y = temp[iG + (jG-2)*widthG].w;
set.z = temp[iG + (jG-1)*widthG].w;
set.w = temp[iG + (jG+0)*widthG].w;
}
location.x = I;
location.y = (i*4 + S);
write_imagef(dstBuffer, location , set);
}
两个内核实现
卷积:
__kernel void ConvolutionKernel(__read_only image2d_t srcBuffer,
__constant int4 *par,
__constant float *filter,
__local float4 *cache,
__local float4 *temp,
__write_only image2d_t dstBuffer)
{
int width = par[0].x;
int height = par[0].y;
int widthG = par[0].z;
int heightG = par[0].w;
int gaussFilterWidth = par[1].x;
int margin = 1 + (gaussFilterWidth>>2);
int2 location;
float4 focus,set;
float *s = &set;
int m,start;
int widthGM = widthG + (margin<<1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST;
int i = get_global_id(0);
int j = get_global_id(1);
int iG = get_local_id(0);
int jG = get_local_id(1);
location.x = i;
location.y = j;
focus = read_imagef(srcBuffer, sampler, location);
cache[(iG+margin) + jG*widthGM] = focus;
if(iG == 0)
{
if(i == 0)
{
cache[0 + jG*widthGM].x = focus.x;
cache[0 + jG*widthGM].y = focus.x;
cache[0 + jG*widthGM].z = focus.x;
cache[0 + jG*widthGM].w = focus.x;
cache[1 + jG*widthGM].x = focus.x;
cache[1 + jG*widthGM].y = focus.x;
cache[1 + jG*widthGM].z = focus.x;
cache[1 + jG*widthGM].w = focus.x;
}
else
{
location.x = i-margin;
location.y = j;
cache[jG*widthGM] = read_imagef(srcBuffer, sampler, location);
location.x = i-(margin-1);
location.y = j;
cache[1 + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
}
}
if(iG == ((widthGM-1)-(margin<<1)))
{
if(i == (width-1))
{
cache[(widthGM-2) + jG*widthGM].x = focus.w;
cache[(widthGM-2) + jG*widthGM].y = focus.w;
cache[(widthGM-2) + jG*widthGM].z = focus.w;
cache[(widthGM-2) + jG*widthGM].w = focus.w;
cache[(widthGM-1) + jG*widthGM].x = focus.w;
cache[(widthGM-1) + jG*widthGM].y = focus.w;
cache[(widthGM-1) + jG*widthGM].z = focus.w;
cache[(widthGM-1) + jG*widthGM].w = focus.w;
}
else
{
location.x = i+margin;
location.y = j;
cache[(widthGM-1) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
location.x = i+(margin-1);
location.y = j;
cache[(widthGM-2) + jG*widthGM] = read_imagef(srcBuffer, sampler, location);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
float4 bar[10],barX[10],barY[10],barZ[10],barW[10];
float4 *p = &bar, *pX = &barX, *pY = &barY, *pZ = &barZ, *pW = &barW;
float *f = &bar, *fX = &barX, *fY = &barY, *fZ = &barZ, *fW = &barW;
float4 gauss[4];
float *gf = &gauss;
float4 acc;
gf[0] = filter[0];
gf[1] = filter[1];
gf[2] = filter[2];
gf[3] = filter[3];
gf[4] = filter[4];
gf[5] = filter[5];
gf[6] = filter[6];
gf[7] = filter[7];
gf[8] = filter[8];
gf[9] = filter[9];
gf[10]= filter[10];
gf[11]= filter[11];
gf[12]= filter[12];
gf[13]= filter[13];
gf[14]= filter[14];
gf[15]= 0.0f;
start = iG + jG*widthGM;
fX[0] = cache[start+0].y;
fX[1] = fY[0] = cache[start+0].z;
fX[2] = fY[1] = fZ[0] = cache[start+0].w;
fX[3] = fY[2] = fZ[1] = fW[0] = cache[start+1].x;
fX[4] = fY[3] = fZ[2] = fW[1] = cache[start+1].y;
fX[5] = fY[4] = fZ[3] = fW[2] = cache[start+1].z;
fX[6] = fY[5] = fZ[4] = fW[3] = cache[start+1].w;
fX[7] = fY[6] = fZ[5] = fW[4] = cache[start+2].x;
fX[8] = fY[7] = fZ[6] = fW[5] = cache[start+2].y;
fX[9] = fY[8] = fZ[7] = fW[6] = cache[start+2].z;
fX[10]= fY[9] = fZ[8] = fW[7] = cache[start+2].w;
fX[11]= fY[10]= fZ[9] = fW[8] = cache[start+3].x;
fX[12]= fY[11]= fZ[10]= fW[9] = cache[start+3].y;
fX[13]= fY[12]= fZ[11]= fW[10]= cache[start+3].z;
fX[14]= fY[13]= fZ[12]= fW[11]= cache[start+3].w;
fX[15]= fY[14]= fZ[13]= fW[12]= cache[start+4].x;
fX[15]= fY[15]= fZ[14]= fW[13]= cache[start+4].y;
fX[15]= fY[15]= fZ[15]= fW[14]= cache[start+4].z;
fX[15]= fY[15]= fZ[15]= fW[15]= 0.0f;
acc.x = fX[0]*gf[0] + fX[1]*gf[1] + fX[2]*gf[2] + fX[3]*gf[3] + fX[4]*gf[4] + fX[5]*gf[5] + fX[6]*gf[6] + fX[7]*gf[7] + fX[8]*gf[8] + fX[9]*gf[9] + fX[10]*gf[10] + fX[11]*gf[11] + fX[12]*gf[12] + fX[13]*gf[13] + fX[14]*gf[14] + fX[15]*gf[15];
acc.y = fY[0]*gf[0] + fY[1]*gf[1] + fY[2]*gf[2] + fY[3]*gf[3] + fY[4]*gf[4] + fY[5]*gf[5] + fY[6]*gf[6] + fY[7]*gf[7] + fY[8]*gf[8] + fY[9]*gf[9] + fY[10]*gf[10] + fY[11]*gf[11] + fY[12]*gf[12] + fY[13]*gf[13] + fY[14]*gf[14] + fY[15]*gf[15];
acc.z = fZ[0]*gf[0] + fZ[1]*gf[1] + fZ[2]*gf[2] + fZ[3]*gf[3] + fZ[4]*gf[4] + fZ[5]*gf[5] + fZ[6]*gf[6] + fZ[7]*gf[7] + fZ[8]*gf[8] + fZ[9]*gf[9] + fZ[10]*gf[10] + fZ[11]*gf[11] + fZ[12]*gf[12] + fZ[13]*gf[13] + fZ[14]*gf[14] + fZ[15]*gf[15];
acc.w = fW[0]*gf[0] + fW[1]*gf[1] + fW[2]*gf[2] + fW[3]*gf[3] + fW[4]*gf[4] + fW[5]*gf[5] + fW[6]*gf[6] + fW[7]*gf[7] + fW[8]*gf[8] + fW[9]*gf[9] + fW[10]*gf[10] + fW[11]*gf[11] + fW[12]*gf[12] + fW[13]*gf[13] + fW[14]*gf[14] + fW[15]*gf[15];
location.x = i;
location.y = j;
write_imagef(dstBuffer, location , acc);
}
移调:
__kernel void TransponseKernel(__read_only image2d_t srcBuffer,
__constant int4 *par,
__local float4 *temp,
__write_only image2d_t dstBuffer)
{
int width = par[0].x;
int height = par[0].y;
int widthG = par[0].z;
int heightG = par[0].w;
int gaussFilterWidth = par[1].x;
int margin = 1 + (gaussFilterWidth>>2);
int widthGM = widthG + (margin<<1);
int2 location;
float4 focus,set;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST;
int i = get_global_id(0);
int j = get_global_id(1);
int iG = get_local_id(0);
int jG = get_local_id(1);
location.x = i;
location.y = j;
focus = read_imagef(srcBuffer, sampler, location);
temp[iG + jG*widthG] = focus;
barrier(CLK_LOCAL_MEM_FENCE);
int I,S;
I = j >> 2;
S = j & 3;
if(S == 0)
{
set.x = temp[iG + (jG+0)*widthG].x;
set.y = temp[iG + (jG+1)*widthG].x;
set.z = temp[iG + (jG+2)*widthG].x;
set.w = temp[iG + (jG+3)*widthG].x;
}
if(S == 1)
{
set.x = temp[iG + (jG-1)*widthG].y;
set.y = temp[iG + (jG+0)*widthG].y;
set.z = temp[iG + (jG+1)*widthG].y;
set.w = temp[iG + (jG+2)*widthG].y;
}
if(S == 2)
{
set.x = temp[iG + (jG-2)*widthG].z;
set.y = temp[iG + (jG-1)*widthG].z;
set.z = temp[iG + (jG+0)*widthG].z;
set.w = temp[iG + (jG+1)*widthG].z;
}
if(S == 3)
{
set.x = temp[iG + (jG-3)*widthG].w;
set.y = temp[iG + (jG-2)*widthG].w;
set.z = temp[iG + (jG-1)*widthG].w;
set.w = temp[iG + (jG+0)*widthG].w;
}
location.x = I;
location.y = (i*4 + S);
write_imagef(dstBuffer, location , set);
}