这个问题源自我之前的一个提问(this question)。从那时起,我一直在改进我的 IDCT 内核,以提高它的效率。
这个版本的内核产生正确的输出,但速度很慢:
void idct_1D(__local int *Y);
/*
 * 8x8 inverse DCT kernel, serial form: the body does not use any work-item
 * id to split the work, so each work-item that executes it runs the whole
 * row pass followed by the whole column pass.
 *
 * input  - 64 coefficients, read as input[(row << 3) + col].
 * output - not named directly in this body; results are stored through the
 *          X() macro (presumably an index into output - confirm against
 *          the macro definition).
 *
 * Y(), X(), SCALE(), DESCALE() and S_BITS are defined outside this snippet.
 */
__kernel void IDCT(__global int* input, __global uchar* output)
{
    unsigned int kid = get_global_id(0); /* not referenced again in this body */
    __local int Y[64];
    __local int Yc[8];
    int row, col;

    /* Row pass: scale one row of coefficients, then transform it in place. */
    for (row = 0; row < 8; ++row) {
        for (col = 0; col < 8; ++col) {
            Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
        }
        idct_1D(&Y(row, 0));
    }

    /* Column pass: gather a column into Yc, transform it, emit the samples. */
    for (col = 0; col < 8; ++col) {
        for (row = 0; row < 8; ++row) {
            Yc[row] = Y(row, col);
        }
        idct_1D(Yc);
        for (row = 0; row < 8; ++row) {
            int sample = 128 + DESCALE(Yc[row], S_BITS + 3);
            /* Clamp to 0..255. */
            if (sample <= 0) {
                sample = 0;
            } else if (sample >= 255) {
                sample = 255;
            }
            X(row, col) = sample;
        }
    }
}
我试图按下面的方式修改它的结构,使这个内核的并行度更高:
/*
 * 8x8 inverse DCT kernel, one work-item per coefficient (2-D launch, 8 x 8).
 *
 * input  - 64 coefficients, read as input[(row << 3) + col].
 * output - not named directly in this body; results are stored through the
 *          X() macro (presumably an index into output - confirm against
 *          the macro definition).
 *
 * The work is split into stages separated by local-memory barriers so that
 * no work-item reads a value of Y or Yc that another work-item is still
 * writing:
 *   1. every work-item scales and stores its own coefficient;
 *   2. the work-item with col == 0 in each row transforms that row in place;
 *   3. every work-item copies its element into the transposed scratch Yc;
 *   4. the work-item with row == 0 in each column transforms that column;
 *   5. every work-item descales, clamps and stores its own sample.
 *
 * Yc has one 8-entry row per column so that concurrent column transforms do
 * not share scratch space.
 *
 * NOTE(review): Y and Yc are __local, so this relies on all 64 work-items
 * being in the same work-group - confirm the work-group size used at launch.
 *
 * Y(), X(), SCALE(), DESCALE() and S_BITS are defined outside this snippet.
 */
__kernel void IDCT(__global int* input, __global uchar* output)
{
    __local int Y[64];
    __local int Yc[8][8];
    const int row = get_global_id(0);
    const int col = get_global_id(1);
    /* Barriers must be reached by every work-item, so the range check only
       guards the work between them, never the barriers themselves. */
    const int in_block = (row < 8) && (col < 8);

    /* Stage 1: load and scale one coefficient per work-item. */
    if (in_block) {
        Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Stage 2: row transforms, one work-item per row. */
    if (in_block && col == 0) {
        idct_1D(&Y(row, 0));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Stage 3: transpose into the per-column scratch rows. */
    if (in_block) {
        Yc[col][row] = Y(row, col);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Stage 4: column transforms, one work-item per column. */
    if (in_block && row == 0) {
        idct_1D(Yc[col]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Stage 5: descale, add the 128 offset, clamp to 0..255 and store. */
    if (in_block) {
        int r = 128 + DESCALE(Yc[col][row], S_BITS + 3);
        r = r > 0 ? (r < 255 ? r : 255) : 0;
        X(row, col) = r;
    }
}
上面的内核给了我正确的输出,但我发现代码的处理速度没有变化。
main.c 中调用该内核的代码如下所示:
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
/*
 * For every component of the scan, run the IDCT kernel once per 8x8 block:
 * upload the 64 coefficients, launch the kernel on an 8 x 8 grid, read the
 * result back, then release the per-block buffers.
 */
for (index = 0; index < SOS_section.n; index++) {
    uint32_t component_index = component_order[index];
    /* Number of blocks for this component: high nibble of HV times low nibble. */
    int nb_MCU = ((SOF_component[component_index].HV >> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);
    for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {
        const size_t globalForInverseDCT[2] = {8, 8};
        const size_t output_bytes = (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4;

        /* Input buffer: the 64 coefficients of the current block. */
        cl_mem DCT_Input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 64 * sizeof(cl_int), unZZ_MCU, &ret);
        chk(ret, "clCreateBuffer");
        //Output buffer
        cl_mem DCT_Output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, output_bytes, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), &ret);
        chk(ret, "clCreateBuffer");

        ret = clSetKernelArg(cos_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
        ret |= clSetKernelArg(cos_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
        chk(ret, "clSetKernelArg");

        /* The global size is passed as a pointer to its first element. */
        ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 2, NULL, globalForInverseDCT, NULL, 0, NULL, NULL);
        chk(ret, "clEnqueueNDRangeKernel");
        //Timing-End..
        /* Blocking read (CL_TRUE): the data is in host memory on return. */
        ret = clEnqueueReadBuffer(command_queue, DCT_Output, CL_TRUE, 0, output_bytes, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), 0, NULL, NULL);
        chk(ret, "clEnqueueReadBuffer");

        /* The buffers are created per block, so release them per block;
           otherwise every iteration leaks two device allocations. */
        clReleaseMemObject(DCT_Input);
        clReleaseMemObject(DCT_Output);
    }
    //other function
}
//code continues...
如何进一步优化此内核?
编辑:
理想情况下,我希望将 IDCT 分解为多个更小的内核。
感谢我在 this question 中得到的帮助,原来的普通(主机端)函数已被分解为更小的子函数,如下所示:
/*
 * Scaling step of the split IDCT: passes each of the 64 input coefficients
 * through SCALE(..., S_BITS) and stores the result via the Y() macro.
 *
 * input  - 64 coefficients, read as input[(k << 3) + l].
 * output - not used by this step.
 *
 * NOTE(review): if Y() indexes the array Y declared below (presumably it
 * does), the scaled values live only in this function's local storage and
 * are not visible to the caller - confirm how they reach the next step.
 */
void IDCTforX(int32_t *input, uint8_t *output) {
    int32_t Y[64];
    int32_t k, l;

    for (k = 0; k < 8; k++) {
        for (l = 0; l < 8; l++) {
            Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        }
    }
}
/*
 * Row-transform step of the split IDCT: calls idct_1d() on each of the
 * eight rows addressed by &Y(k, 0).
 *
 * input, output - not referenced in this body.
 *
 * NOTE(review): Y is declared locally and nothing in this function writes
 * it before idct_1d() is called on it - confirm where the row data is
 * supposed to come from.
 */
void IDCTfor1dim(int32_t *input, uint8_t *output)
{
int32_t Y[64];
int32_t k, l;
int32_t Yc[8];
for (k= 0; k < 8; k++)
{
/* Transform row k in place. */
idct_1d(&Y(k, 0));
}
}
在Y方向:
/*
 * Column-transform step of the split IDCT: for each column l, copies
 * Y(k, l) for k = 0..7 into Yc[l] and calls idct_1d() on that row of Yc.
 *
 * input, output - not referenced in this body.
 *
 * NOTE(review): Y and Yc are declared locally; nothing in this function
 * writes Y before it is read, and Yc is not visible to the caller - confirm
 * how data is meant to flow between the split steps.
 */
void IDCTforY(int32_t *input, uint8_t *output)
{
    int32_t Y[64];
    int32_t k, l;
    int32_t Yc[8][8];

    for (l = 0; l < 8; l++)
    {
        /* Gather column l of Y into row l of Yc. */
        for (k = 0; k < 8; k++)
        {
            Yc[l][k] = Y(k, l);
        }
        idct_1d(Yc[l]);
    }
}
/*
 * Final step of the split IDCT: for every (k, l), descales Yc[l][k] by
 * S_BITS + 3, adds 128, clamps the result to 0..255 and stores it through
 * the X() macro.
 *
 * input  - not referenced in this body.
 * output - not named directly here; X() is defined outside this snippet
 *          (presumably it indexes output - confirm).
 *
 * NOTE(review): Yc is declared locally and nothing in this function writes
 * it before it is read - confirm where the transformed columns come from.
 */
void IDCT_DescaleY(int32_t *input, uint8_t *output)
{
int32_t Y[64];
int32_t k, l;
int32_t Yc[8][8];
for (l = 0; l < 8; l++)
{
for (k = 0; k < 8; k++)
{
int32_t r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
/* Clamp to the 0..255 range. */
r = r > 0 ? (r < 255 ? r : 255) : 0;
X(k, l) = r;
}
}
}
在main.c
中,当我按以下顺序调用函数(代替原始函数)时,我得到了正确的输出:
IDCTforX(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCTfor1dim(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCTforY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCT_DescaleY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
我把这些函数改写成了内核,但得到的只是一个灰色的画面。我的新内核如下所示:
/*---------------IDCTForX----------------------------*/
/*
 * Scaling kernel of the split IDCT: the work-item at (row, col) passes
 * input[(row << 3) + col] through SCALE(..., S_BITS) and stores the result
 * via the Y() macro. Work-items outside the 8 x 8 range do nothing.
 *
 * output - not referenced in this body.
 *
 * NOTE(review): Y is declared inside the kernel without an address-space
 * qualifier; if Y() indexes that array (presumably it does), the stored
 * value is not visible to other work-items or to later kernels - confirm.
 */
__kernel void IDCTforX(__global int *input, __global uchar *output) {
    int Y[64];
    const unsigned int row = get_global_id(0);
    const unsigned int col = get_global_id(1);

    /* Nothing to do outside the 8 x 8 block. */
    if (row >= 8 || col >= 8) {
        return;
    }

    Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
}
/*---------------IDCTfor1Dim-------------------------*/
/*
 * Row-transform kernel of the split IDCT: calls idct_1D() on each of the
 * eight rows addressed by &Y(k, 0).
 *
 * input, output - not referenced in this body.
 *
 * NOTE(review): Y is declared inside the kernel without an address-space
 * qualifier and is never written here before idct_1D() runs on it; data
 * stored by another kernel into its own Y is not this array - confirm the
 * intended data flow.
 * NOTE(review): the idct_1D prototype shown earlier in this post takes a
 * __local int *; the pointer passed here is not __local - confirm.
 */
__kernel void IDCTfor1dim(__global int *input, __global uchar *output){
int Y[64];
/* The value from get_global_id(0) is overwritten by the loop below, so
   every work-item iterates over all eight rows. */
unsigned int k= get_global_id(0);
for (k= 0; k < 8; k++)
{
idct_1D(&Y(k, 0));
}
}
/*---------------IDCTYSplit----------------------------*/
/*
 * Transpose kernel of the split IDCT: the work-item at (k, l) copies
 * Y(k, l) into Yc[l][k]. Work-items outside the 8 x 8 range do nothing.
 *
 * input, output - not referenced in this body.
 *
 * NOTE(review): both Y and Yc are declared inside the kernel without an
 * address-space qualifier; Y is read without being written here and Yc is
 * not passed anywhere - confirm how data is meant to enter and leave this
 * kernel.
 */
__kernel void IDCTYSplit(__global int *input, __global uchar *output) {
int Y[64];
int k= get_global_id(0);
int l= get_global_id(1);
int Yc[8][8];
if ((k < 8) && (l < 8))
{
/* Row l of Yc receives column l of Y. */
Yc[l][k] = Y(k, l);
}
}
/*---------------IDCTY_Inverse----------------------------*/
/*
 * Column-transform kernel of the split IDCT: the work-item with id l
 * (dimension 0) calls idct_1D() on Yc[l]. Ids of 8 or more do nothing.
 *
 * input, output - not referenced in this body; Y is declared but unused.
 *
 * NOTE(review): Yc is declared inside the kernel without an address-space
 * qualifier and is never written here before idct_1D() runs on it - confirm
 * the intended data flow.
 * NOTE(review): the idct_1D prototype shown earlier in this post takes a
 * __local int *; the pointer passed here is not __local - confirm.
 */
__kernel void IDCTY_Inverse(__global int *input, __global uchar *output) {
int Y[64];
int l= get_global_id(0);
int Yc[8][8];
if (l < 8)
{
idct_1D(Yc[l]);
}
}
/*---------------IDCTY_Descale----------------------------*/
/*
 * Final kernel of the split IDCT: the work-item at (l, k) descales
 * Yc[l][k] by S_BITS + 3, adds 128, clamps the result to 0..255 and stores
 * it through the X() macro. Work-items outside the 8 x 8 range do nothing.
 *
 * input  - not referenced in this body; Y is declared but unused.
 * output - not named directly here; X() is defined outside this snippet
 *          (presumably it indexes output - confirm).
 *
 * NOTE(review): Yc is declared inside the kernel without an address-space
 * qualifier and is never written here before it is read, so the stored
 * samples do not depend on any earlier kernel's results - confirm the
 * intended data flow.
 */
__kernel void IDCTY_Descale(__global int *input, __global uchar *output) {
int Y[64];
/* Note the id order: dimension 0 selects l, dimension 1 selects k. */
int l= get_global_id(0);
int k= get_global_id(1);
int Yc[8][8];
if ((l < 8) && (k < 8))
{
int r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
/* Clamp to the 0..255 range. */
r = r > 0 ? (r < 255 ? r : 255) : 0;
X(k, l) = r;
}
}
答案 0 :(得分:0)
只有 64 次运算和 64 个元素的缓冲区,这个量级太小了。时间全都花在了启动内核以及与 GPU 之间来回传输数据的固定开销上。适合 GPU 的任务不仅要能够并行化,而且每次内核调用应当包含数百万到数十亿次运算。