Question

下面的函数包含三层嵌套的 for 循环。为了便于理解，我在下文给出了完整的函数。我想先并行化最内层 for 循环中的代码，因为它占用的 CPU 时间最多；之后再考虑外面的两层 for 循环。我在最内层 for 循环中看到了循环间的依赖关系，以及对内联函数的调用。能否重写最内层的 for 循环，使其可以用 OpenMP pragma 并行化？请告诉我该怎么做。下面先给出我最关心的那个循环，然后给出包含该循环的完整函数，供参考。

我想并行化的是下面这个循环。

//* LOOP WHICH I WANT TO PARALLELIZE *//

 /*
  * Excerpt of the innermost loop: walks four consecutive picture rows and,
  * for each row, adds sixteen byte_abs[] look-ups (indexed by the difference
  * "reference pixel - original pixel") into four running sums, one sum per
  * group of four horizontally adjacent pixels.
  *
  * State carried from one iteration to the next:
  *   - abs_y       : post-incremented in the call, so every pass requests
  *                   the next row;
  *   - orgptr      : never reset here, advances 16 elements per pass;
  *   - LineSadBlk0..3 : accumulators, updated on every pass.
  * refptr is not carried: it is re-fetched through PelYline_11 at the top of
  * every pass.
  *
  * NOTE(review): byte_abs and PelYline_11 are defined outside this excerpt;
  * byte_abs presumably maps a signed difference to its absolute value.
  */
 for (y = 0; y < 4; y++)    
 {
  /* Fetch the reference row for the current abs_y, then step abs_y. */
  refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width);

  /* Pixels 0..3 of the row */
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];

  /* Pixels 4..7 of the row */
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];

  /* Pixels 8..11 of the row */
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];

  /* Pixels 12..15 of the row */
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
 }

包含该循环的完整函数如下，供参考。

/*!

***********************************************************************

* \brief

* Setup the fast search for an macroblock

***********************************************************************

*/

void SetupFastFullPelSearch (short ref, int list) // <-- reference frame parameter, list0 or 1

{

short pmv[2];
    pel_t orig_blocks[256], *orgptr=orig_blocks, *refptr, *tem; // created pointer tem
    int offset_x, offset_y, x, y, range_partly_outside, ref_x, ref_y, pos, abs_x, abs_y, bindex, blky;
    int LineSadBlk0, LineSadBlk1, LineSadBlk2, LineSadBlk3;
    int max_width, max_height;
    int img_width, img_height;
    StorablePicture *ref_picture;
    pel_t *ref_pic;


int** block_sad = BlockSAD[list][ref][7];
int search_range = max_search_range[list][ref];
int max_pos = (2*search_range+1) * (2*search_range+1);

int list_offset = ((img->MbaffFrameFlag)&&(img->mb_data[img->current_mb_nr].mb_field))? img->current_mb_nr%2 ? 4 : 2 : 0;

int apply_weights = ( (active_pps->weighted_pred_flag && (img->type == P_SLICE || img->type == SP_SLICE)) ||
(active_pps->weighted_bipred_idc && (img->type == B_SLICE)));

ref_picture = listX[list+list_offset][ref];

//===== Use weighted Reference for ME ====

if (apply_weights && input->UseWeightedReferenceME)

ref_pic = ref_picture->imgY_11_w;

else

ref_pic = ref_picture->imgY_11;


max_width = ref_picture->size_x - 17;
max_height = ref_picture->size_y - 17;
img_width = ref_picture->size_x;
img_height = ref_picture->size_y;

//===== get search center: predictor of 16x16 block =====

SetMotionVectorPredictor (pmv, enc_picture->ref_idx, enc_picture->mv, ref, list, 0, 0, 16, 16);

search_center_x[list][ref] = pmv[0] / 4;
search_center_y[list][ref] = pmv[1] / 4;

if (!input->rdopt)
{

//--- correct center so that (0,0) vector is inside ---

search_center_x[list][ref] = max(-search_range, min(search_range, search_center_x[list][ref]));

search_center_y[list][ref] = max(-search_range, min(search_range, search_center_y[list][ref]));

}

search_center_x[list][ref] += img->opix_x;
search_center_y[list][ref] += img->opix_y;
offset_x = search_center_x[list][ref];
offset_y = search_center_y[list][ref];

//===== copy original block for fast access =====

for (y = img->opix_y; y < img->opix_y+16; y++)
for (x = img->opix_x; x < img->opix_x+16; x++)

*orgptr++ = imgY_org [y][x];

//===== check if whole search range is inside image =====

if (offset_x >= search_range && offset_x <= max_width - search_range &&

offset_y >= search_range && offset_y <= max_height - search_range )

{
 range_partly_outside = 0; PelYline_11 = FastLine16Y_11;
}

else
{
 range_partly_outside = 1;
}

//===== determine position of (0,0)-vector =====

if (!input->rdopt)

{

ref_x = img->opix_x - offset_x;

ref_y = img->opix_y - offset_y;

for (pos = 0; pos < max_pos; pos++)
{
 if (ref_x == spiral_search_x[pos] &&

ref_y == spiral_search_y[pos])
  {
   pos_00[list][ref] = pos;
   break;
  }

}

  }



//===== loop over search range (spiral search): get blockwise SAD =====
**// =====THIS IS THE PART WHERE NESTED FOR STARTS=====**

for (pos = 0; pos < max_pos; pos++) // OUTERMOST FOR LOOP

{
abs_y = offset_y + spiral_search_y[pos];
abs_x = offset_x + spiral_search_x[pos];

if (range_partly_outside)
{
  if (abs_y >= 0 && abs_y <= max_height && abs_x >= 0 && abs_x <= max_width )
     {
      PelYline_11 = FastLine16Y_11;
     }

  else
     {
      PelYline_11 = UMVLine16Y_11;
     }
 }

orgptr = orig_blocks;
bindex = 0;

for (blky = 0; blky < 4; blky++)    // SECOND FOR LOOP
{
 LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0;

   for (y = 0; y < 4; y++)    //INNERMOST FOR LOOP WHICH I WANT TO PARALLELIZE
   {
     refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width);

      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];

      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];

      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];

      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
     }

    block_sad[bindex++][pos] = LineSadBlk0;
    block_sad[bindex++][pos] = LineSadBlk1;
    block_sad[bindex++][pos] = LineSadBlk2;
    block_sad[bindex++][pos] = LineSadBlk3;

  }

 }


//===== combine SAD's for larger block types =====

SetupLargerBlocks (list, ref, max_pos);

//===== set flag marking that search setup have been done =====

search_setup_done[list][ref] = 1;

}

#endif // _FAST_FULL_ME_

我重写了代码，试图消除最内层 for 循环（即 `for (y = 0; y < 4; y++)`）以及其中多个 LineSadBlk 累加变量上的依赖关系。如有错误，请指出。我认为这样改写之后，refptr 和 orgptr 上的依赖已经解决，但 LineSadBlk0、1、2、3 仍然存在依赖：如果第一次和第二次迭代并行执行，多个线程会同时读取并更新 LineSadBlk0、1、2、3 的值。这个问题该如何解决？

/*!
***********************************************************************
* \brief
*    Setup the fast search for an macroblock
***********************************************************************
*/
void SetupFastFullPelSearch (short ref, int list)  // <--  reference frame parameter,   list0 or 1
{
 short   pmv[2];
 pel_t   orig_blocks[256];
 //pel_t   *orgptr, *refptr[4];
 pel_t *orgptr[4],*refptr[4];  //defined by me new
 int     offset_x, offset_y, x, y, range_partly_outside, ref_x, ref_y, pos, abs_x, abs_y, bindex, blky;
 int     LineSadBlk0, LineSadBlk1, LineSadBlk2, LineSadBlk3;
 int     max_width, max_height;
 int     img_width, img_height;

 StorablePicture *ref_picture;
 pel_t   *ref_pic;

 int**   block_sad     = BlockSAD[list][ref][7];
 int     search_range  = max_search_range[list][ref];
 int     max_pos       = (2*search_range+1) * (2*search_range+1);

 int     list_offset   = ((img->MbaffFrameFlag)&&(img->mb_data[img->current_mb_nr].mb_field))? img->current_mb_nr%2 ? 4 : 2 : 0;

 int     apply_weights = ( (active_pps->weighted_pred_flag && (img->type == P_SLICE || img->type == SP_SLICE)) ||
                        (active_pps->weighted_bipred_idc && (img->type == B_SLICE)));


 ref_picture     = listX[list+list_offset][ref];

 //===== Use weighted Reference for ME ====
 if (apply_weights && input->UseWeightedReferenceME)
 ref_pic       = ref_picture->imgY_11_w;
 else
 ref_pic       = ref_picture->imgY_11;

 max_width     = ref_picture->size_x - 17;
 max_height    = ref_picture->size_y - 17;

 img_width     = ref_picture->size_x;
 img_height    = ref_picture->size_y;

 //===== get search center: predictor of 16x16 block =====
 SetMotionVectorPredictor (pmv, enc_picture->ref_idx, enc_picture->mv, ref, list, 0, 0, 16, 16);        //call 1
 search_center_x[list][ref] = pmv[0] / 4;
 search_center_y[list][ref] = pmv[1] / 4;

 if (!input->rdopt)
 {
  //--- correct center so that (0,0) vector is inside ---
  search_center_x[list][ref] = max(-search_range, min(search_range, search_center_x[list][ref]));
  search_center_y[list][ref] = max(-search_range, min(search_range,    search_center_y[list][ref]));
  }

  search_center_x[list][ref] += img->opix_x;
  search_center_y[list][ref] += img->opix_y;

  offset_x = search_center_x[list][ref];
  offset_y = search_center_y[list][ref];


// orgptr=orig_blocks;
orgptr[0]= orig_blocks          //all org pointers defined orig blocks
orgptr[1]= orig_blocks;
orgptr[2]= orig_blocks;
orgptr[3]= orig_blocks;



   //===== copy original block for fast access =====
  for   (y = img->opix_y; y < img->opix_y+16; y++)
  for (x = img->opix_x; x < img->opix_x+16; x++)
{         
//*orgptr++ = imgY_org [y][x];
*(orgptr[0])++ = imgY_org [y][x];                   // img stored in all orgptr
*(orgptr[1])++ = imgY_org [y][x];
*(orgptr[2])++ = imgY_org [y][x];
*(orgptr[3])++ = imgY_org [y][x];
}

   //===== check if whole search range is inside image =====
   if (offset_x >= search_range && offset_x <= max_width  - search_range &&
  offset_y >= search_range && offset_y <= max_height - search_range   )
   {
    range_partly_outside = 0; PelYline_11 = FastLine16Y_11;     //search range is fully inside image
    }
   else
   {
    range_partly_outside        //search range is partly outside image
    }

     //===== determine position of (0,0)-vector =====
   if (!input->rdopt)
   {
    ref_x = img->opix_x - offset_x;
    ref_y = img->opix_y - offset_y;

    for (pos = 0; pos < max_pos; pos++)
    {
     if (ref_x == spiral_search_x[pos] &&
      ref_y == spiral_search_y[pos])
      {
       pos_00[list][ref] = pos;
       break;
      }
    }
   }

   //===== loop over search range (spiral search): get blockwise SAD =====
  for (pos = 0; pos < max_pos; pos++)
  {
   abs_y = offset_y + spiral_search_y[pos];
   abs_x = offset_x + spiral_search_x[pos];

    if (range_partly_outside)
    {
     if (abs_y >= 0 && abs_y <= max_height &&
      abs_x >= 0 && abs_x <= max_width    )
      {
       PelYline_11 = FastLine16Y_11;            //call 2
      }
      else
      {
       PelYline_11 = UMVLine16Y_11;                     //call 3
      }
     }

    //orgptr=orig_blocks;
      orgptr[0]=orig_blocks;
  orgptr[1]=orgptr[0]+16;
  orgptr[2]=orgptr[1]+16;
  orgptr[3]=orgptr[2]+16;
      bindex = 0;


    for (blky = 0; blky < 4; blky++)
    {
      LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0;

       // i added the following to take refptr out of loop
  refptr[0] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);           //call either 2 or 3
  abs_y++;
  refptr[1] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);           //call either 2 or 3
   abs_y++;
  refptr[2] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);           //call either 2 or 3
   abs_y++;
   refptr[3] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);          //call either 2 or 3
   abs_y++;

omp_set_num_threads(4);
#pragma omp parallel for reduction(+:LineSadBlk0,LineSadBlk1,LineSadBlk2,LineSadBlk3)
     for (y = 0; y < 4; y++)
     {

{
LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    }


    {
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}


    {
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}


    {
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}
  }

  }


  block_sad[bindex++][pos] = LineSadBlk0;
  block_sad[bindex++][pos] = LineSadBlk1;
  block_sad[bindex++][pos] = LineSadBlk2;
  block_sad[bindex++][pos] = LineSadBlk3;

  }
 }





 //===== combine SAD's for larger block types =====
  SetupLargerBlocks (list, ref, max_pos);                          //call4


  //===== set flag marking that search setup have been done =====
  search_setup_done[list][ref] = 1;
  }
  #endif // _FAST_FULL_ME_

使用openmp并行化代码

0 个答案: