Question

这个问题是我之前提出的另一个问题（this question）的后续。从那时起，我一直在改进我的IDCT内核，以提高它的效率。

这个版本的内核产生正确的输出，但速度很慢：

/* Forward declaration of the 1-D IDCT helper; its definition is not shown.
 * The kernel below passes it either the start of a row of the 8x8 block or
 * an 8-entry column buffer, both in __local memory, and reads the results
 * back from that same storage afterwards. */
void idct_1D(__local int *Y);

 /*
  * 8x8 inverse DCT of one coefficient block.
  *
  * input  - 64 coefficients, addressed as input[(row << 3) + col].
  * output - destination written through the X(row, col) macro (defined
  *          elsewhere; presumably indexes this pointer).
  *
  * Every work-item executes the complete two-pass transform below; the
  * work-item index is fetched but does not select any part of the work.
  */
 __kernel void IDCT(__global int* input, __global uchar* output)
{
    /* Work-item index along dimension 0; not consulted afterwards. */
    unsigned int kid = get_global_id(0);

    /* 8x8 working block and a one-column scratch buffer, both __local. */
    __local int Y[64];
    __local int Yc[8];
    int row, col;

    /* Row pass: scale one row of coefficients into Y, then run the 1-D
     * transform on that row. */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
        }
        idct_1D(&Y(row, 0));
    }

    /* Column pass: gather one column into Yc, transform it, then descale,
     * add 128 and clamp each result to 0..255 before storing it. */
    for (col = 0; col < 8; col++) {
        for (row = 0; row < 8; row++) {
            Yc[row] = Y(row, col);
        }

        idct_1D(Yc);

        for (row = 0; row < 8; row++) {
            int pixel = 128 + DESCALE(Yc[row], S_BITS + 3);
            if (pixel <= 0) {
                pixel = 0;
            } else if (pixel >= 255) {
                pixel = 255;
            }
            X(row, col) = pixel;
        }
    }
}

我尝试按如下方式修改它的结构，使这个内核的并行度更高：

/*
 * 8x8 inverse DCT of one coefficient block, spread over a 2-D NDRange.
 *
 * Dimension 0 selects the row handled in the row pass; dimension 1 selects
 * the column handled in the column pass.
 *
 * input  - 64 coefficients, addressed as input[(row << 3) + col].
 * output - destination written through the X(row, col) macro (defined
 *          elsewhere; presumably indexes this pointer).
 *
 * Y lives in __local memory, so it is shared per work-group. The column pass
 * reads all eight rows of Y, therefore each work-group must contain all eight
 * indices of dimension 0 (e.g. an 8x8 or 8x1 work-group).
 */
__kernel void IDCT(__global int* input, __global uchar* output)
{
    /* Row-major 8x8 working block shared by the work-group. */
    __local int Y[64];
    /* One 8-entry scratch buffer per column, so work-items that process
     * different columns never write to the same storage. */
    __local int Yc[8][8];

    int k = get_global_id(0);
    int l;
    int lid = get_global_id(1);

    /* Row pass. Only the first work-item along dimension 1 of the work-group
     * handles row k, so the transform (which returns its results through the
     * buffer it is given) is never run concurrently on the same row. */
    if (k < 8 && get_local_id(1) == 0)
    {
        for (l = 0; l < 8; l++)
        {
            Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        }
        idct_1D(&Y(k, 0));
    }

    /* Every row must be completely written to __local memory before any
     * work-item starts reading columns. The barrier sits outside all
     * conditionals so that every work-item of the group reaches it. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Column pass. Only the first work-item along dimension 0 of the
     * work-group handles column lid, using its own scratch column. */
    if (lid < 8 && get_local_id(0) == 0)
    {
        for (k = 0; k < 8; k++)
        {
            Yc[lid][k] = Y(k, lid);
        }

        idct_1D(Yc[lid]);

        /* Descale, add 128 and clamp to 0..255 before storing. */
        for (k = 0; k < 8; k++)
        {
            int r = 128 + DESCALE(Yc[lid][k], S_BITS + 3);
            r = r > 0 ? (r < 255 ? r : 255) : 0;
            X(k, lid) = r;
        }
    }
}

上面的内核给了我正确的输出，但我发现代码的处理速度没有变化。

我调用代码的main.c如下所示：

  for (index_X = 0; index_X < nb_MCU_X; index_X++) {

    for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {

    for (index = 0; index < SOS_section.n; index++) {

     uint32_t component_index = component_order[index];

     int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);

     /* Run the IDCT kernel once per 64-coefficient block of this component.
      * chk() is the error-check helper this file already uses after
      * clCreateBuffer; it is applied here to every OpenCL status so that a
      * failure is not silently overwritten by the next call. */
     for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {

         /* Same byte count is used to create the output buffer and to read it back. */
         const size_t output_bytes = (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4;
         const size_t globalForInverseDCT[2] = {8, 8};

         /* Input buffer: 64 coefficients copied from unZZ_MCU at creation. */
         cl_mem DCT_Input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 64 * sizeof(cl_int), unZZ_MCU, &ret);
         chk(ret, "clCreateBuffer");

         /* Output buffer, initialised from the destination block. */
         cl_mem DCT_Output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, output_bytes, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), &ret);
         chk(ret, "clCreateBuffer");

         ret = clSetKernelArg(cos_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
         ret |= clSetKernelArg(cos_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
         chk(ret, "clSetKernelArg");

         /* global_work_size expects a const size_t *; the array decays to
          * that type, whereas &array is a pointer-to-array. */
         ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 2, NULL, globalForInverseDCT, NULL, 0, NULL, NULL);
         chk(ret, "clEnqueueNDRangeKernel");

         //Timing-End..

         /* Blocking read (CL_TRUE): the result is in host memory on return. */
         ret = clEnqueueReadBuffer(command_queue, DCT_Output, CL_TRUE, 0, output_bytes, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), 0, NULL, NULL);
         chk(ret, "clEnqueueReadBuffer");

         /* The buffers are created anew on every iteration, so release them
          * here; otherwise one pair of device buffers leaks per block. */
         clReleaseMemObject(DCT_Input);
         clReleaseMemObject(DCT_Output);
     }

 //other function

 }

 //code continues...

如何进一步优化此内核？

编辑：

理想情况下，我希望将IDCT分解为更小的内核。得益于我在另一个问题（this question）中得到的帮助，普通的C函数已被拆分为更小的子函数，如下所示：

/*
 * Stage 1 of the split IDCT: scales the 64 input coefficients.
 *
 * input  - 64 coefficients, read as input[(k << 3) + l].
 * output - not used by this function.
 *
 * NOTE(review): the scaled values are stored through Y(k, l) (macro defined
 * elsewhere; presumably indexes the local array Y). Y is local to this call
 * and nothing is returned or written through output, so the values are not
 * handed to any caller or later stage by anything visible here. If a later
 * stage appears to see them, that can only come from its own local array
 * happening to occupy the same stack storage, which C does not guarantee.
 */
void IDCTforX(int32_t *input, uint8_t *output) {

    int32_t Y[64];
    int32_t k, l;
    int32_t Yc[8];   /* declared but not used in this function */

    for (k = 0; k < 8; k++) {
        for (l = 0; l < 8; l++)
        {
            Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        }
    }
}

/*
 * Stage 2 of the split IDCT: runs idct_1d on each of the eight rows of Y.
 *
 * input, output - not used by this function.
 *
 * NOTE(review): Y is a local array that this function never initialises
 * before passing &Y(k, 0) to idct_1d, so the transform operates on
 * indeterminate values as far as the language is concerned.
 */
void IDCTfor1dim(int32_t *input, uint8_t *output)
{
int32_t Y[64];
    int32_t k, l;    /* l is not used */
    int32_t Yc[8];   /* not used */

    for (k= 0; k < 8; k++)
    {
        idct_1d(&Y(k, 0));
    }
}

在Y方向：

/*
 * Stage 3 of the split IDCT: copies each column of Y into a row of Yc
 * (Yc[l][k] = Y(k, l)) and runs idct_1d on that row.
 *
 * input, output - not used by this function.
 *
 * NOTE(review): Y is a local array that is read here without being
 * initialised in this function, and Yc is local too, so nothing visible
 * here passes data in from the previous stage or out to the next one.
 */
void IDCTforY(int32_t *input, uint8_t *output) 
{
  int32_t Y[64];
  int32_t k, l;
  int32_t Yc[8][8];

  for (l = 0; l < 8; l++) 
  {
      for (k = 0; k < 8; k++)
          Yc[l][k] = Y(k, l);
      idct_1d(Yc[l]);
   }
}

/*
 * Stage 4 of the split IDCT: descales each Yc[l][k] by S_BITS + 3, adds 128,
 * clamps the result to 0..255 and stores it through X(k, l) (macro defined
 * elsewhere; presumably writes to output).
 *
 * input - not used by this function.
 *
 * NOTE(review): Yc is a local array that this function reads without ever
 * writing it, so the values it descales are indeterminate as far as the
 * language is concerned.
 */
void IDCT_DescaleY(int32_t *input, uint8_t *output) 
{
  int32_t Y[64];       /* not used */
  int32_t k, l;
  int32_t Yc[8][8];

for (l = 0; l < 8; l++) 
   {
       for (k = 0; k < 8; k++) 
       {   
           int32_t r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
           /* clamp to the 0..255 range */
           r = r > 0 ? (r < 255 ? r : 255) : 0;
           X(k, l) = r;
       }
   }
}

在main.c中，当我按以下顺序调用函数（代替原始函数）时，我得到了正确的输出：

/* The four stages are called in order, each with the same input block and
 * the same destination pointer for the current component / block index. */
IDCTforX(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCTfor1dim(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCTforY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
IDCT_DescaleY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));

我把这些函数改写成了内核，但只得到一个灰色的屏幕。我的新内核如下所示：

/*---------------IDCTForX----------------------------*/

/*
 * Kernel stage 1: each work-item (k, l) with k < 8 and l < 8 scales one
 * input coefficient and stores it through Y(k, l).
 *
 * NOTE(review): Y is declared without an address-space qualifier, which in
 * OpenCL C makes it private to each work-item. No statement here stores to
 * output or to any other global buffer, so the scaled value is discarded
 * when the work-item finishes and cannot reach the following kernels.
 */
__kernel void IDCTforX(__global int *input, __global uchar *output) {

    int Y[64];
    unsigned int k= get_global_id(0);
    unsigned int l= get_global_id(1);

        if ((k < 8) && (l < 8))
            {
                Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);

            }

}

/*---------------IDCTfor1Dim-------------------------*/

/*
 * Kernel stage 2: runs idct_1D on each of the eight rows of Y.
 *
 * NOTE(review):
 *  - Y is a private array that this kernel never fills, so idct_1D is
 *    applied to uninitialised data; input and output are not used.
 *  - k is initialised from get_global_id(0) and then immediately reset by
 *    the for loop, so every work-item processes all eight rows.
 *  - The idct_1D prototype shown earlier in this listing takes a
 *    __local int *, while &Y(k, 0) here points into a private array —
 *    confirm which idct_1D declaration this kernel is compiled against.
 */
__kernel void IDCTfor1dim(__global int *input, __global uchar *output){

int Y[64];
unsigned int k= get_global_id(0);

    for (k= 0; k < 8; k++)
    {
        idct_1D(&Y(k, 0));
    }

}

/*---------------IDCTYSplit----------------------------*/

/*
 * Kernel stage 3a: each work-item (k, l) with k < 8 and l < 8 copies
 * Y(k, l) into Yc[l][k], i.e. one element of the transposed block.
 *
 * NOTE(review): both Y and Yc are private arrays of this kernel. Y is read
 * without ever being written here, and Yc is not stored anywhere that
 * outlives the work-item; input and output are not used.
 */
__kernel void IDCTYSplit(__global int *input, __global uchar *output) {

int Y[64];
      int k= get_global_id(0);
      int l= get_global_id(1);
      int Yc[8][8];

      if ((k < 8) && (l < 8))
      {
              Yc[l][k] = Y(k, l);

      }
}

/*---------------IDCTY_Inverse----------------------------*/

/*
 * Kernel stage 3b: each work-item l < 8 runs idct_1D on row l of Yc.
 *
 * NOTE(review): Yc is a private array that this kernel never fills, so the
 * transform is applied to uninitialised data and its result is discarded
 * when the work-item finishes. Y, input and output are not used.
 */
__kernel void IDCTY_Inverse(__global int *input, __global uchar *output) {

int Y[64];
      int l= get_global_id(0);
      int Yc[8][8];

 if (l < 8)
          {
          idct_1D(Yc[l]);
       }
}

/*---------------IDCTY_Descale----------------------------*/

/*
 * Kernel stage 4: each work-item (l, k) with l < 8 and k < 8 descales
 * Yc[l][k] by S_BITS + 3, adds 128, clamps the result to 0..255 and stores
 * it through X(k, l) (macro defined elsewhere; presumably writes to output).
 *
 * NOTE(review): Yc is a private array that this kernel reads without ever
 * writing it, so the stored pixel is derived from uninitialised data rather
 * than from the earlier kernels' results. Y and input are not used.
 */
__kernel void IDCTY_Descale(__global int *input, __global uchar *output) {

int Y[64];

    /* Note the dimension order: l comes from dimension 0, k from dimension 1. */
    int l= get_global_id(0);

    int k= get_global_id(1);

      int Yc[8][8];

       if ((l < 8) && (k < 8))
           {
               int r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
               /* clamp to the 0..255 range */
               r = r > 0 ? (r < 255 ? r : 255) : 0;
               X(k, l) = r;
           }

}

Answer 1

只有64次运算和一个64项的缓冲区，这个规模小了好几个数量级。仅仅是向GPU传入数据、再从GPU取回数据的启动开销，就占去了全部时间。你需要的任务不仅要能够并行化，而且每次GPU内核调用都应包含数百万到数十亿次运算。

如何针对GPU优化我的OpenCL内核？

1 个答案: