Question

我写了一个OpenCL内核，它在输入矩阵上执行盒子模糊。该实现最初是为GPU编写的，并使用本地内存来存储工作组中的工作项邻域。然后，我在CPU上运行内核，并将运行时间与依赖缓存从全局内存中自动读取的实现进行比较，而不是先将它们手动存储在本地内存中。

假设CPU没有＆＃34;本地内存＆＃34;而使用RAM，在CPU上使用本地内存应该弊大于利。然而，＆＃34;本地记忆＆＃34;内核更快比依赖缓存10ms的那个（在工作项目/工作组/＆＃34的8192x8192矩阵上，~112ms与~122ms相比;每个工作项计算的值的数量＆＃34; 34;设置被认为对两种实现都是最佳的，因为它们是由两个内核的自动调谐器分别找到的。）

内核使用主机上提供的OpenCL intel平台在 Intel Xeon E5-1620 v2 CPU上运行。

发生这种情况的原因是什么？？

＆＃34;本地内存＆＃34; 内核：每个工作项都适用于＆＃34;块＆＃34;价值观每个块都复制到共享内存，并且其邻居被复制到本地内存，具体取决于块在工作组中的位置，因此不会复制值两次。然后，在屏障之后，计算最终值。
下面的代码是X方向内核;除了检查值以计算输出值的方向外，y方向内核完全相同。

__kernel void boxblur_x (__read_only __global float* image,
                   __local float* localmem,
                   __write_only __global float* output)
{
// size of input and output matrix
int MATRIX_SIZE_Y = IMAGE_HEIGHT;
int MATRIX_SIZE_X = IMAGE_WIDTH;
int MATRIX_SIZE   = MATRIX_SIZE_Y  * MATRIX_SIZE_X;

// mask size
int S_L = MASK_SIZE_LEFT;
int S_U = 0;
int S_R = MASK_SIZE_RIGHT;
int S_D = 0;
int SHAPE_SIZE_Y = S_U + S_D + 1;
int SHAPE_SIZE_X = S_L + S_R + 1;
int SHAPE_SIZE = SHAPE_SIZE_Y * SHAPE_SIZE_X;

// tuning parameter
// ---------------------------------------------------------------
//work items in y/x dimension per work group
int NUM_WI_Y = get_local_size(1);
int NUM_WI_X = get_local_size(0);

//size of blocks
int BLOCKHEIGHT = X_BLOCKHEIGHT;
int BLOCKWIDTH = X_BLOCKWIDTH;

//position in matrix
int GLOBAL_POS_X = get_global_id(0) * BLOCKWIDTH;
int GLOBAL_POS_Y = get_global_id(1) * BLOCKHEIGHT;

//localMemory size
int LOCALMEM_WIDTH = S_L + NUM_WI_X * BLOCKWIDTH + S_R;

//position in localmem
int LOCAL_POS_X = S_L + get_local_id(0) * BLOCKWIDTH;
int LOCAL_POS_Y = S_U + get_local_id(1) * BLOCKHEIGHT;


// copy values to shared memory
for (int i = 0; i < BLOCKHEIGHT; i++)
{
    for (int j = 0; j < BLOCKWIDTH; j++)
    {
        localmem[(LOCAL_POS_X + j) + (LOCAL_POS_Y + i) * LOCALMEM_WIDTH] = image[GLOBAL_POS_X + j + (GLOBAL_POS_Y + i) * MATRIX_SIZE_X];
    }
}

// only when all work items have arrived here,
// computation continues - otherwise, not all needed
// values might be available in local memory
barrier (CLK_LOCAL_MEM_FENCE);


for (int i = 0; i < BLOCKHEIGHT; i++)
{
    for (int j = 0; j < BLOCKWIDTH; j++)
    {
        float sum = 0;
        for (int b = 0; b <= S_L + S_R; b++)
        {
            sum += localmem[(get_local_id(0) * BLOCKWIDTH + j + b) + (get_local_id(1) * BLOCKHEIGHT + i) * LOCALMEM_WIDTH];
        }

        // divide by size of mask
        float pixelValue = sum / SHAPE_SIZE;

        // write new pixel value to output image
        output[GLOBAL_POS_X + j + ((GLOBAL_POS_Y + i) * get_global_size(0) * BLOCKWIDTH)] = pixelValue;
    }
}
}

＆＃34; L1缓存内核＆＃34;：尽管有许多定义，但它完全相同，但依赖于块的全局内存缓存，而不是显式管理本地内存。

#define WG_BLOCK_SIZE_Y ( OUTPUT_SIZE_Y / NUM_WG_Y )
#define WG_BLOCK_SIZE_X ( OUTPUT_SIZE_X / NUM_WG_X )

#define WI_BLOCK_SIZE_Y ( WG_BLOCK_SIZE_Y / NUM_WI_Y )
#define WI_BLOCK_SIZE_X ( WG_BLOCK_SIZE_X / NUM_WI_X )

#define WG_BLOCK_OFFSET_Y ( WG_BLOCK_SIZE_Y * WG_ID_Y )
#define WG_BLOCK_OFFSET_X ( WG_BLOCK_SIZE_X * WG_ID_X )

#define WI_BLOCK_OFFSET_Y ( WI_BLOCK_SIZE_Y * WI_ID_Y )
#define WI_BLOCK_OFFSET_X ( WI_BLOCK_SIZE_X * WI_ID_X )

#define NUM_CACHE_BLOCKS_Y ( WI_BLOCK_SIZE_Y / CACHE_BLOCK_SIZE_Y )
#define NUM_CACHE_BLOCKS_X ( WI_BLOCK_SIZE_X / CACHE_BLOCK_SIZE_X )

#define CACHE_BLOCK_OFFSET_Y ( CACHE_BLOCK_SIZE_Y * ii )
#define CACHE_BLOCK_OFFSET_X ( CACHE_BLOCK_SIZE_X * jj )

#define reorder(j)     ( ( (j) / WI_BLOCK_SIZE_X) + ( (j) % WI_BLOCK_SIZE_X) * NUM_WI_X )
#define reorder_inv(j) reorder(j)

#define view( i, j, x, y )    input[ ((i) + (x)) * INPUT_SIZE_X + ((j) + (y)) ]
 #define a_wg( i, j, x, y )    view(  WG_BLOCK_OFFSET_Y   + (i), WG_BLOCK_OFFSET_X    + reorder(j), (x), (y) )
 #define a_wi( i, j, x, y )    a_wg(  WI_BLOCK_OFFSET_Y   + (i), WI_BLOCK_OFFSET_X    + (j)       , (x), (y) )
 #define a_cache( i, j, x, y ) a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j)       , (x), (y) )


 #define res_wg( i, j ) output[ (WG_BLOCK_OFFSET_Y + i) * OUTPUT_SIZE_X + WG_BLOCK_OFFSET_X + reorder_inv(j) ]

 #define res(i, j)         output[ (i) * OUTPUT_SIZE_X + (j) ]
 #define res_wg( i, j )    res(    WG_BLOCK_OFFSET_Y + (i)   , WG_BLOCK_OFFSET_X    + reorder_inv(j) )
 #define res_wi( i, j )    res_wg( WI_BLOCK_OFFSET_Y + (i)   , WI_BLOCK_OFFSET_X    + (j)            )
 #define res_cache( i, j ) res_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j)            )


float f_stencil( __global float* input, int ii, int jj, int i, int j )
{
  // indices
  const int WG_ID_X = get_group_id(0);
  const int WG_ID_Y = get_group_id(1);

  const int WI_ID_X = get_local_id(0);
  const int WI_ID_Y = get_local_id(1);

  // computation
  float sum = 0;
  for( int y = 0 ; y < SHAPE_SIZE_Y ; ++y )
    for( int x = 0 ; x < SHAPE_SIZE_X ; ++x)
       sum += a_cache(i, j, y, x);

  return sum / SHAPE_SIZE;
}

__kernel void stencil( __global float* input,
                       __global float* output
                     )
{

  //indices
  const int WG_ID_X = get_group_id(0);
  const int WG_ID_Y = get_group_id(1);

  const int WI_ID_X = get_local_id(0);
  const int WI_ID_Y = get_local_id(1);

  // iteration over cache blocks
  for( int ii=0 ; ii < NUM_CACHE_BLOCKS_Y ; ++ii )
    for( int jj=0 ; jj < NUM_CACHE_BLOCKS_X ; ++jj )
      // iteration within a cache block
      for( int i=0 ; i < CACHE_BLOCK_SIZE_Y ; ++i )
        for( int j=0 ; j < CACHE_BLOCK_SIZE_X ; ++j )
          res_cache( i, j ) = f_stencil( input, ii, jj, i , j );
}

Answer 1

组合“L1缓存”版本的循环时：

for( int ii=0 ; ii < NUM_CACHE_BLOCKS_Y ; ++ii )
 for( int jj=0 ; jj < NUM_CACHE_BLOCKS_X ; ++jj )
  for( int i=0 ; i < CACHE_BLOCK_SIZE_Y ; ++i )
   for( int j=0 ; j < CACHE_BLOCK_SIZE_X ; ++j )
     for( int y = 0 ; y < SHAPE_SIZE_Y(SU+SD+1) ; ++y )
       for( int x = 0 ; x < SHAPE_SIZE_X(SL+SR+1) ; ++x)
              ....  += a_cache(i, j, y, x);

和“本地”版本：

for (int i = 0; i < BLOCKHEIGHT; i++)
    for (int j = 0; j < BLOCKWIDTH; j++)
        for (int b = 0; b <= S_L + S_R; b++)
            ... +=input[...]

“a_cache”有很多计算

a_cache（i，j，y，x）;

变为

a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j), x, y )

然后变成

view(  WG_BLOCK_OFFSET_Y   + (CACHE_BLOCK_OFFSET_Y + (i)), WG_BLOCK_OFFSET_X    + reorder(CACHE_BLOCK_OFFSET_X + (j)), (x), (y) )

然后变成

view(  WG_BLOCK_OFFSET_Y   + (CACHE_BLOCK_OFFSET_Y + (i)), WG_BLOCK_OFFSET_X    + ( ( (CACHE_BLOCK_OFFSET_X + (j)) / WI_BLOCK_SIZE_X) + ( (CACHE_BLOCK_OFFSET_X + (j)) % WI_BLOCK_SIZE_X) * NUM_WI_X )

，（x），（y））

然后变成

 input[ ((WG_BLOCK_OFFSET_Y   + (CACHE_BLOCK_OFFSET_Y + (i))) + (x)) * INPUT_SIZE_X + ((WG_BLOCK_OFFSET_X    + ( ( (CACHE_BLOCK_OFFSET_X + (j)) / WI_BLOCK_SIZE_X) + ( (CACHE_BLOCK_OFFSET_X + (j)) % WI_BLOCK_SIZE_X) * NUM_WI_X) + (y)) ]

这是9次加法+2次乘法+ 1次模+ 1次除法。

“本地”版本

 sum += localmem[(get_local_id(0) * BLOCKWIDTH + j + b) + (get_local_id(1) * BLOCKHEIGHT + i) * LOCALMEM_WIDTH];

这是4次加法+3次乘法但没有模数和没有除法。

“L1缓存”版本需要保留6个循环的循环计数器，它们可能使用更多的cpu寄存器甚至L1缓存。数据高速缓存大小为每核128 kB或每线程64 kB。如果你每个核心启动1024个线程（每个核心是一个工作组吗？）那么只需要循环计数器1024 * 6 * 4 = 24kB L1。这使得40kB使用。当你添加“const int WG_ID_X”和其他变量（其中5个）时，只剩下20kB。现在为其参数添加“f_stencil”函数临时“堆栈”变量，可能没有剩余的L1缓存，从而降低了效率。 “本地”版本使用了大约10-12个变量（未使用的变量可能优化了？）并且没有函数，所以对L1来说可能更好。

https://software.intel.com/en-us/node/540486

说

为了减少维护工作组的开销，您应该创建尽可能大的工作组，这意味着64个或更多工作项。 一个上限是所访问数据集的大小 更好不超过单个作品中L1缓存的大小 基。

和

如果你的内核代码包含了barrier指令，那么问题就出现了工作组规模成为权衡。本地和私人记忆越多工作组中的每个工作项都需要，最优化的越小工作组规模是。原因是障碍也会发出副本有关使用的私人和本地内存总量的说明 自状态以来工作组中工作组中的所有工作项到达屏障的每个工作项都会保存，然后再继续与另一个工作项目。

在“本地”版本中只有1个屏障，在此之前，使用了8个变量，因此复制所需的内存不多？

OpenCL：本地内存比CPU上的L1缓存更快？

1 个答案: