Question

我正在尝试从GPU的this代码优化IDCT。我在 NVIDIA Tesla k20c 的系统上使用的GPU。

原始代码中编写的IDCT函数如下所示：

void IDCT(int32_t *input, uint8_t *output) {
    int32_t Y[64];
    int32_t k, l;

    for (k = 0; k < 8; k++) {
        for (l = 0; l < 8; l++) Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        idct_1d(&Y(k, 0));
    }

    for (l = 0; l < 8; l++) {
        int32_t Yc[8];

        for (k = 0; k < 8; k++) Yc[k] = Y(k, l);

        idct_1d(Yc);

        for (k = 0; k < 8; k++) {
            int32_t r = 128 + DESCALE(Yc[k], S_BITS + 3);
            r = r > 0 ? (r < 255 ? r : 255) : 0;
            X(k, l) = r;
        }
    }
}

有关.c文件的更多详细信息，请参阅this链接。

以下列方式从main.c调用此函数：

 for (index_X = 0; index_X < nb_MCU_X; index_X++) {

for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {

for (index = 0; index < SOS_section.n; index++) {

 uint32_t component_index = component_order[index];

int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);

for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {

 //unpack block function
 //iqzz function

IDCT(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));

 }
//other function                    
 }
//Code continues...

main.c的详细信息可以在this链接中找到。

我通过以下方式从IDCT函数中创建了一个内核：

__kernel void IDCT(__global int* input, __global uchar* output) 
{
 unsigned int kid= get_global_id(0);

 int Y[64]; 
 int k,l;
 int Yc[8];

 for (k = 0; k < 8; k++)
 {
  for (l = 0; l < 8; l++)
  {
   Y(k,l) = SCALE(input[(k << 3) + l], S_BITS);     
  }         
 idct_1D(&Y(k,0));
 }

for (l = 0; l < 8; l++)
{
for (k = 0; k < 8; k++)
{Yc[k] = Y(k, l);}

idct_1D(Yc);

for (k = 0; k < 8; k++)
{

int r = 128 + DESCALE(Yc[k], S_BITS + 3);
r = r > 0 ? (r < 255 ? r : 255) : 0;
X(k, l) = r;
}

}

}

我以这种方式从main.c调用此内核：

    for (index_X = 0; index_X < nb_MCU_X; index_X++) {

    for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {

    for (index = 0; index < SOS_section.n; index++) {

     uint32_t component_index = component_order[index];

     int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);

     for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {

    cl_mem DCT_Input = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, 64 * sizeof(cl_int), unZZ_MCU, &ret);

    //Output buffer
    cl_mem  DCT_Output = clCreateBuffer(context, CL_MEM_READ_WRITE| CL_MEM_COPY_HOST_PTR, (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), &ret);
    chk(ret, "clCreateBuffer");

    ret = clSetKernelArg(cos_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
    ret |= clSetKernelArg(cos_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);

//Timing-Start..

  start_time = wtime();

  size_t globalForInverseDCT= 1024;

  size_t localForInverseDCT= 256;

 ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 1, NULL, &globalForInverseDCT, &localForInverseDCT, 0, NULL, NULL);

//Timing-End..

 run_time = wtime() - start_time;

 ret = clEnqueueReadBuffer(command_queue, DCT_Output, CL_TRUE, 0, (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), 0, NULL, NULL);

 }

 //other function

 }

 //code continues...

我正在使用此功能来计时IDCT：

double wtime()
{

   /* Use a generic timer */
   static int sec = -1;
   struct timeval tv;
   gettimeofday(&tv, NULL);
   if (sec < 0) sec = tv.tv_sec;
   return (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec;

}

使用我的计时方法在 CPU 上执行IDCT的时间 5微秒。 GPU 上的执行时间 20微秒。（我也使用了OpenCL的内置分析命令，它给了我31微秒的执行时间。）

如何修改我的代码以便在GPU上运行得更快？

我的工作：

我尝试以this的方式将内核分解为较小的内核，但我的屏幕输出会变为静态。

我再次对代码进行了分析，得到了以下输出：

根据Nvidia的Visual Profiler，我的IDCT内核的执行时间 17.6微秒（大约）。

修改

我使用命令行分析器再次分析了我的代码，idct的执行时间现在 4.9微秒。

请建议我如何进一步优化我的内核。

如何让我的GPU上的IDCT运行得更快？

0 个答案: