
时间:2018-01-28 18:32:19

标签: c opencl

我有以下四个用于 IDCT 的 C 函数,它们是在上一个问题中得到帮助后写出来的。我希望把这些函数全部转换为 OpenCL 内核(并在 GPU 上运行),但仅仅是第一个函数,我就很难得到预期的结果。


/*
 * First stage of the split 8x8 IDCT: apply SCALE(..., S_BITS) to each of the
 * 64 input coefficients and store the result in input_inter.
 *
 * input        64 coefficients, addressed as input[(k << 3) + l].
 * output       unused by this stage; kept so all stages share one signature.
 * input_inter  receives the 64 scaled values, same row-major addressing.
 *
 * Y(), SCALE and S_BITS are defined outside this listing.
 * NOTE(review): Y(k,l) presumably indexes the local array Y below - confirm
 * against the macro definition.
 */
void IDCTforX(int32_t *input, uint8_t *output, int32_t input_inter[64]) {
    int32_t Y[64];
    int32_t k, l;

    (void)output;

    for (k = 0; k < 8; k++) {
        /* Both statements belong inside the inner loop: without the braces the
         * store to input_inter ran once per row with l == 8, i.e. one element
         * past the row (and past the buffer for k == 7). */
        for (l = 0; l < 8; l++) {
            Y(k,l) = SCALE(input[(k << 3) + l], S_BITS);
            input_inter[(k << 3) + l] = Y(k,l);
        }
    }
}




/*
 * Second stage of the split 8x8 IDCT: run idct_1d in place on each of the
 * eight rows of input_inter.
 *
 * input        unused by this stage; kept so all stages share one signature.
 * output       unused by this stage.
 * input_inter  64 values, row k starting at input_inter[k << 3]; each row is
 *              handed to idct_1d, which is defined outside this listing.
 */
void IDCTfor1dim(int32_t *input, uint8_t *output, int32_t input_inter[64])
{
    int32_t k;

    (void)input;
    (void)output;

    /* The previous version rebound input_inter to an uninitialised local
     * array before this loop, so idct_1d read indeterminate data and the
     * caller's buffer was never touched. Operate on the caller's buffer. */
    for (k = 0; k < 8; k++) {
        idct_1d(&input_inter[k << 3]);
    }
}



/*
 * Third stage of the split IDCT as posted: copies Y(k, l) into Yc[l][k] and
 * then writes values into output_inter.
 *
 * NOTE(review): the brace lines of this listing appear to have been lost, so
 * the loop nesting below is ambiguous; the comments describe the code as
 * listed.
 * NOTE(review): Y() is a macro defined elsewhere; presumably it indexes the
 * local array Y, which nothing in this function fills before it is read -
 * confirm where the row-transformed data is supposed to come from.
 */
void IDCTforY(int32_t *input, uint8_t *output, int32_t output_inter[8]) {

    int32_t Y[64];
      int32_t k, l;
      int32_t Yc[8][8];

      // Gather with swapped indices: Yc[l][k] takes Y(k, l).
      for (l = 0; l < 8; l++)
          for (k = 0; k < 8; k++)
              Yc[l][k] = Y(k, l);


              // NOTE(review): as listed, this loop runs after the nest above,
              // when l == 8, so Y(k,l) is evaluated with l == 8 - verify.
              for (k = 0; k < 8; k++)
                  (output_inter[k]) = Y(k,l);

      // NOTE(review): as listed, k == 8 here, so output_inter[k] indexes one
      // past the 8 elements named in the parameter declaration - verify.
      for (l = 0; l < 8; l++)
          output_inter[k]= Yc[l][0];



/*
 * Final stage of the split IDCT as posted: applies DESCALE(..., S_BITS + 3)
 * to a value from Yc, adds 128, clamps the result to 0..255 and stores it
 * through X(k, l).
 *
 * NOTE(review): the brace lines of this listing appear to have been lost, so
 * the extent of each loop body below is ambiguous.
 * NOTE(review): Yc is a local array that nothing in this function fills
 * before it is read - confirm where the transformed data should come from.
 * NOTE(review): X() and DESCALE are defined elsewhere; X(k, l) presumably
 * addresses the output buffer - confirm.
 */
void IDCT_Descale(int32_t *input, uint8_t *output, int32_t output_inter[8]) {

    int32_t Y[64];
    int32_t k, l;
    int32_t Yc[8][8];

//Running the loop for de-scaling separately....
    // Each pass over l overwrites the same 8 slots of output_inter, so only
    // the values written for l == 7 remain afterwards.
    for (l = 0; l < 8; l++)
               for (k = 0; k < 8; k++)
                   output_inter[k]= Yc[l][k];

       // De-scale, re-centre by +128 and clamp to 0..255.
       // NOTE(review): the three statements below presumably formed one braced
       // loop body in the original; as listed only the first one is the body.
       for (l = 0; l < 8; l++)
           for (k = 0; k < 8; k++)
               int32_t r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
               r = r > 0 ? (r < 255 ? r : 255) : 0;
               X(k, l) = r;




/*
 * OpenCL counterpart of the first IDCT stage: each work-item with global id
 * k < 8 applies SCALE(..., S_BITS) to the 8 coefficients of row k and stores
 * them in Yin.
 *
 * input   64 coefficients, addressed as input[(k << 3) + lin].
 * output  unused by this kernel; kept so the argument list matches the host.
 * Yin     receives the 64 scaled values, same addressing.
 *
 * Y(), SCALE and S_BITS are defined outside this listing.
 * NOTE(review): Y(k, lin) presumably indexes the per-work-item array Y
 * below - confirm against the macro definition.
 */
__kernel void IDCTForX(__global int *input, __global uchar *output, __global int Yin[64])
{
    int Y[64];
    unsigned int lin;
    unsigned int k = get_global_id(0);

    /* Guard against a global size larger than the 8 rows. */
    if (k < 8) {
        /* Both statements must sit inside the loop: without the braces the
         * store to Yin ran once, with lin == 8, i.e. outside row k. */
        for (lin = 0; lin < 8; lin++) {
            Y(k, lin) = SCALE(input[(k << 3) + lin], S_BITS);
            Yin[(k << 3) + lin] = Y(k, lin);
        }
    }
}




// Host-side driver fragment: runs the "IDCTForX" kernel for every block and
// then calls the remaining IDCT stages on the CPU.
// NOTE(review): this is an excerpt - the closing braces of the loops and the
// declarations of program, context, command_queue, DCT_Input and DCT_Output
// are not part of the listing.
cl_uchar *YCbCr_MCU_ds[3] = { NULL, NULL, NULL};
cl_int unZZ_MCU[64];
cl_int idct_out[8];

// Host copy of the kernel's third argument (64 intermediate values).
cl_int Inter_DCT[64];

// NOTE(review): ret is not checked after either creation call below.
cosX_kernel= clCreateKernel(program, "IDCTForX", &ret);

// Device buffer of 64 cl_int backing kernel argument 2.
cl_mem  DCT_Intermediate = clCreateBuffer(context, CL_MEM_READ_WRITE, 64 * sizeof(cl_int), NULL, &ret);

for (index_X = 0; index_X < nb_MCU_X; index_X++) {

  for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {

    for (index = 0; index < SOS_section.n; index++) {

        int component_index = component_order[index];

        // Product of the high and low nibble of HV.
        // NOTE(review): presumably the two sampling factors - confirm.
        int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);

        for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++){

         // unpack_block and iqzz (elided in the listing)
         // NOTE(review): no upload of unZZ_MCU into DCT_Input is visible here;
         // confirm the elided code performs one (e.g. clEnqueueWriteBuffer),
         // otherwise the kernel reads whatever the buffer already holds.

            /* Set OpenCL kernel arguments: 0 = input, 1 = output, 2 = intermediate */
            ret = clSetKernelArg(cosX_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
            ret |= clSetKernelArg(cosX_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
            ret |= clSetKernelArg(cosX_kernel, 2, sizeof(cl_mem), (void *)&DCT_Intermediate);

            // One-dimensional launch with 8 work-items.
            size_t globalForInverseDCT = 8;

            // NOTE(review): this assignment overwrites the combined
            // clSetKernelArg status before it has been inspected.
            ret = clEnqueueNDRangeKernel(command_queue, cosX_kernel, 1, NULL, &globalForInverseDCT, NULL, 0, NULL, NULL);

            // Blocking read (CL_TRUE) of the 64 intermediate values into Inter_DCT.
            ret = clEnqueueReadBuffer(command_queue, DCT_Intermediate, CL_TRUE, 0, 64 * sizeof(cl_int), &Inter_DCT, 0, NULL, NULL);

            // The remaining stages run on the CPU.

            IDCTfor1dim(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), Inter_DCT);
            IDCTforY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out);
            IDCT_Descale(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out); 

    //Calling the up-sampling function........





// CPU-only driver fragment: for every block, calls the four IDCT stage
// functions in order.
// NOTE(review): this is an excerpt - the loops' closing braces and the
// declaration of idct_intermediate are not part of the listing.
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
          for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
            for (index = 0; index < SOS_section.n; index++) {
              uint32_t component_index = component_order[index];
              // Product of the high and low nibble of HV.
              // NOTE(review): presumably the two sampling factors - confirm.
              int nb_MCU = ((SOF_component[component_index].HV >> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);
              for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {
                // unpack block and iqzz (elided in the listing)

                 // Each stage receives the same input block and the same
                 // 64-byte output slice; the first two share idct_intermediate,
                 // the last two share idct_out.
                 IDCTforX(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 *chroma_ss), idct_intermediate);
                IDCTfor1dim(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_intermediate);
                IDCTforY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out);
              IDCT_Descale(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out);





0 个答案:
