Question

多亏了我在上一个问题中得到的帮助，我现在有以下四个用于IDCT的C函数。我希望把这些函数全部转换为OpenCL内核（并在GPU上运行），但仅仅是第一个函数，我就很难得到预期的结果。

函数1：

/*
 * First IDCT stage: scales every coefficient of an 8x8 block.
 *
 * Each input element at index (k * 8 + l) is passed through
 * SCALE(..., S_BITS); the result is stored via the Y(k, l) macro and then
 * copied to the same index of input_inter.
 *
 * input       - 64 coefficients, read at indices 0..63.
 * output      - not referenced by this stage.
 * input_inter - receives the 64 scaled values.
 */
void IDCTforX(int32_t *input, uint8_t *output, int32_t input_inter[64]) {
    int32_t Y[64];  /* presumably the storage behind the Y(k, l) macro -- TODO confirm */
    int32_t Yc[8];  /* declared but not referenced in this stage */
    int32_t row;
    int32_t col;

    for (row = 0; row < 8; ++row) {
        /* Offset of the first element of this row in the flat 64-entry block. */
        const int32_t base = row << 3;

        for (col = 0; col < 8; ++col) {
            Y(row, col) = SCALE(input[base + col], S_BITS);
            input_inter[base + col] = Y(row, col);
        }
    }
}

函数2：

/*
 * Second IDCT stage: runs idct_1d() on each of the 8 rows of the 8x8 block
 * stored in input_inter.
 *
 * input       - not referenced by this stage.
 * output      - not referenced by this stage.
 * input_inter - 64-element block supplied by the caller; idct_1d() receives a
 *               pointer to the first element of every row (index k * 8) and
 *               presumably transforms the 8 values in place -- confirm against
 *               idct_1d's definition.
 */
void IDCTfor1dim(int32_t *input, uint8_t *output, int32_t input_inter[64])
{
    int32_t k;

    (void)input;
    (void)output;

    /*
     * Work directly on the caller's buffer. The previous version re-pointed
     * input_inter at an uninitialised local array before this loop, so the
     * data handed in by the caller was ignored and the transform ran on
     * indeterminate stack contents.
     */
    for (k = 0; k < 8; k++)
    {
        idct_1d(&input_inter[k << 3]);
    }
}

函数3：

/*
 * Third IDCT stage as posted: gathers Y(k, l) into Yc[l][k], runs idct_1d()
 * on each Yc row and stores values into output_inter.
 *
 * input        - not referenced by this stage.
 * output       - not referenced by this stage.
 * output_inter - declared with 8 elements; written by the loops below.
 *
 * NOTE(review): several accesses below look unintended; see the inline notes.
 */
void IDCTforY(int32_t *input, uint8_t *output, int32_t output_inter[8]) {

    /*
     * NOTE(review): Y is a fresh local with no initialiser and nothing in this
     * function stores to it before the Y(k, l) reads below, so (assuming the
     * Y(k, l) macro indexes this array -- confirm) those reads see
     * indeterminate values rather than the result of the previous stage.
     */
    int32_t Y[64];
      int32_t k, l;
      int32_t Yc[8][8];

      /* Copy with swapped indices: element (k, l) of Y goes to Yc[l][k]. */
      for (l = 0; l < 8; l++)
      {
          for (k = 0; k < 8; k++)
              Yc[l][k] = Y(k, l);

      }


              /*
               * NOTE(review): this loop runs after the nest above has finished,
               * so l == 8 here. If Y(k, l) expands to Y[(k << 3) + l], k == 7
               * reads index 64, one past the end of Y -- confirm the intent.
               */
              for (k = 0; k < 8; k++)
                  (output_inter[k]) = Y(k,l);


      /*
       * NOTE(review): k == 8 after the loop above, so every iteration writes
       * output_inter[8], which is outside the 8 elements named in the
       * parameter declaration. Only Yc[l][0] of each transformed row is stored.
       */
      for (l = 0; l < 8; l++)
      {
          idct_1d(Yc[l]);
          output_inter[k]= Yc[l][0];
      }

}

函数4：

/*
 * Fourth IDCT stage as posted: copies Yc into output_inter, then descales
 * every Yc element, adds 128, clamps the result to 0..255 and stores it via
 * the X(k, l) macro.
 *
 * input        - not referenced by this stage.
 * output       - not referenced directly; presumably the X(k, l) macro writes
 *                into it -- confirm against the macro definition.
 * output_inter - 8 elements, overwritten by the first loop nest.
 */
void IDCT_Descale(int32_t *input, uint8_t *output, int32_t output_inter[8]) {

    /* Y is declared here; presumably unused unless a macro refers to it. */
    int32_t Y[64];
    int32_t k, l;
    /*
     * NOTE(review): Yc is a fresh local with no initialiser and nothing in
     * this function stores to it, so both loop nests below read indeterminate
     * values instead of the output of the previous stage.
     */
    int32_t Yc[8][8];

//Running the loop for de-scaling separately....
    /*
     * Every pass of the outer loop overwrites the same 8 slots, so after the
     * nest output_inter[k] holds Yc[7][k].
     */
    for (l = 0; l < 8; l++)
           {
               for (k = 0; k < 8; k++)
               {
                   output_inter[k]= Yc[l][k];
               }
           }


       for (l = 0; l < 8; l++)
       {
           for (k = 0; k < 8; k++)
           {
               /* Descale by S_BITS + 3 and shift by 128. */
               int32_t r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
               /* Clamp to the range 0..255. */
               r = r > 0 ? (r < 255 ? r : 255) : 0;
               X(k, l) = r;
           }
       }

}

对于函数1和2，我添加了一个名为input_inter[64]的第三个参数，用来保存被修改后的数据，以便把这些数据从一个函数传递给下一个函数。

我对函数1的OpenCL代码如下所示：

/*
 * OpenCL version of the first IDCT stage.
 *
 * Each work-item whose global id (dimension 0) is below 8 handles one row of
 * the 8x8 block: every element of that row is passed through
 * SCALE(..., S_BITS), stored via the Y(row, col) macro and copied to the same
 * index of Yin. Work-items with a larger id do nothing.
 *
 * input  - 64 coefficients in global memory.
 * output - not referenced by this kernel.
 * Yin    - receives the 64 scaled values.
 */
__kernel void IDCTForX(__global int *input, __global uchar *output, __global int Yin[64])
{
    int Y[64];  /* presumably the storage behind the Y(row, col) macro -- TODO confirm */
    const unsigned int row = get_global_id(0);

    /* Only the first 8 work-items map to a row of the block. */
    if (row >= 8)
        return;

    /* Offset of the first element of this row in the flat 64-entry block. */
    const unsigned int base = row << 3;

    for (unsigned int col = 0; col < 8; col++) {
        Y(row, col) = SCALE(input[base + col], S_BITS);
        Yin[base + col] = Y(row, col);
    }
}

注意：我已将程序简化为仅关注与我的问题相关的部分。

我从我的.c代码中调用此内核，如下所示：

/* Host-side state for the OpenCL variant of the decoder (excerpt). */

/* One output pointer per colour component, all initially NULL. */
cl_uchar *YCbCr_MCU_ds[3] = { NULL, NULL, NULL};
/* 64-entry block passed as the first argument of the CPU IDCT stages. */
cl_int unZZ_MCU[64];
/* 8-entry buffer passed as the third argument of IDCTforY / IDCT_Descale. */
cl_int idct_out[8];

//For the third variable
/* Host copy of the kernel's third argument; filled by the read-back below. */
cl_int Inter_DCT[64];

/* NOTE(review): ret is not checked after either creation call in this excerpt. */
cosX_kernel= clCreateKernel(program, "IDCTForX", &ret);

/* Device buffer of 64 cl_int bound to kernel argument 2. */
cl_mem  DCT_Intermediate = clCreateBuffer(context, CL_MEM_READ_WRITE, 64 * sizeof(cl_int), NULL, &ret);

/*
 * Per-MCU decode loop of the OpenCL variant (excerpt; the outermost loop is
 * not closed within this snippet). For every block it launches the IDCTForX
 * kernel, reads DCT_Intermediate back into Inter_DCT and then runs the
 * remaining IDCT stages on the CPU.
 */
for (index_X = 0; index_X < nb_MCU_X; index_X++) {

  for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {

    for (index = 0; index < SOS_section.n; index++) {

        int component_index = component_order[index];

        /* Product of the high and low nibbles of HV gives the block count. */
        int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);

        for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++){

         //unpack_block and iqzz...

            /*
             * NOTE(review): DCT_Input and DCT_Output are neither created nor
             * filled in the visible code. Unless the elided part uploads
             * unZZ_MCU into DCT_Input (e.g. with clEnqueueWriteBuffer) before
             * the launch, the kernel does not see the current block -- confirm.
             */
            /* Set OpenCL kernel arguments */
            ret = clSetKernelArg(cosX_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
            ret |= clSetKernelArg(cosX_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
            ret |= clSetKernelArg(cosX_kernel, 2, sizeof(cl_mem), (void *)&DCT_Intermediate);

            /* One work-item per row of the 8x8 block. */
            size_t globalForInverseDCT = 8;

            /*
             * NOTE(review): ret is overwritten by each of the next two calls
             * and never inspected here, so a failed argument set-up, launch or
             * read would go unnoticed.
             */
            ret = clEnqueueNDRangeKernel(command_queue, cosX_kernel, 1, NULL, &globalForInverseDCT, NULL, 0, NULL, NULL);

            /* Blocking read (CL_TRUE) of the 64 intermediate values into Inter_DCT. */
            ret = clEnqueueReadBuffer(command_queue, DCT_Intermediate, CL_TRUE, 0, 64 * sizeof(cl_int), &Inter_DCT, 0, NULL, NULL);

            //Executing the other functions normally...

            /*
             * NOTE(review): if IDCTfor1dim re-points its input_inter parameter
             * at a local array before using it, the Inter_DCT data read back
             * above is never consumed by the CPU stages -- verify.
             */
            IDCTfor1dim(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), Inter_DCT);
            IDCTforY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out);
            IDCT_Descale(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out); 
     }

    //Calling the up-sampling function........
    }

     //Color-conversion......

     screen_cpyrect();
     }

我原本期望这段代码能给出正确的输出，但结果并不正确。我调试了代码，发现内核IDCTForX中第三个参数的变化与普通C版本的IDCTforX中的变化完全相同。

在我的C代码中，我正常调用该函数，并且输出正确：

/*
 * Plain-C decode loop: for every MCU position and every scan component, run
 * the four IDCT stages on each block, then upsample; colour conversion and
 * the screen copy follow once all components of the MCU are done.
 */
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
    for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
        for (index = 0; index < SOS_section.n; index++) {
            uint32_t component_index = component_order[index];
            /* Block count is the product of the two nibbles of HV. */
            int nb_MCU = ((SOF_component[component_index].HV >> 4) & 0xf)
                       * (SOF_component[component_index].HV & 0x0f);

            for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {
                //unpack block and iqzz...

                /* Destination of this block: 64 bytes per block within the component. */
                uint8_t *block_out = YCbCr_MCU_ds[component_index] + (64 * chroma_ss);

                IDCTforX(unZZ_MCU, block_out, idct_intermediate);
                IDCTfor1dim(unZZ_MCU, block_out, idct_intermediate);
                IDCTforY(unZZ_MCU, block_out, idct_out);
                IDCT_Descale(unZZ_MCU, block_out, idct_out);
            }

            upsampler();
        }

        YCbCr_to_ARGB();

        screen_cpyrect();
    }
}

为什么我的OpenCL代码的行为与我的C代码不同？我的IDCTForX内核代码有什么问题？

如何将此IDCT函数转换为OpenCL内核？

0 个答案: