Question

我有一个 cuda内核，可以同时计算某些图像的形态侵蚀。

我使用存储所有图像的缓冲区将图像传递给内核。算法运行正常，但我无法理解为什么共享内存的算法版本比使用全局内存的算法版本慢。

所以我读到了shared memory bank conflict，据我所知，我认为我的算法不是银行冲突安全。

正确？我怎样才能使我的算法银行冲突安全？我怎样才能提高它的性能？

这是我的成功：

#define STREL_W 5
#define STREL_H 5

#define STREL_SIZE 5


#define TILE_W 16
#define TILE_H 16

#define R (STREL_SIZE/2)
#define KERNEL_RADIUS (STREL_SIZE/2)


#define BLOCK_W (TILE_W+(2*R))
#define BLOCK_H (TILE_H+(2*R))

使用全局内存的

版本（理论上比较慢）：

// use gloabal memory
__global__ void erode_multiple_img(unsigned char * buffer_in,
                            unsigned char * buffer_out,
                            int w,int h ){



int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int plane = blockIdx.z * blockDim.z +threadIdx.z;

int index = (h * w * plane) + (row ) * w + col ;



int min_value = 255;
if( (col >= R) && (col < w-R) && (row >= R) && (row < h-R)){
    // pixels <= the boder-R ---> che if them must be eroded
    for(int dy=-STREL_H/2; dy<=STREL_H/2; dy++){
        for (int dx = -STREL_W/2 ; dx <= STREL_W/2; dx++) {
            min_value = min( buffer_in[index + (dy * w ) + (dx)], min_value);
        }
    }
    buffer_out[index]= min_value;
}else{
    //pixels > border-R ---> (must be eroded)
    buffer_out[index] = 0;
}

}

使用共享内存的

版本（理论上更快）：

__global__ void erode_multiple_img_SM_v2(unsigned char * buffer_in,
                            unsigned char * buffer_out,
                            int w,int h ){


// Data cache: threadIdx.x , threadIdx.y
__shared__ unsigned char data[TILE_W + STREL_SIZE-1 ][TILE_H + STREL_SIZE-1 ];

// coord over img
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;

int plane = blockIdx.z * blockDim.z + threadIdx.z;

int plane_offset = plane*h*w;

int gLoc =plane_offset + row*w +col;



// each threads loads four values from global memory into shared mem
int x, y;   // image based coordinate



if((col<w)&&(row<h)) {

    //line 151
    data[threadIdx.x][threadIdx.y] = buffer_in[gLoc];

    if ( threadIdx.y > (TILE_H - STREL_SIZE))
          //line 157
          data[threadIdx.x][threadIdx.y + STREL_SIZE-1] = row + STREL_SIZE-1 < h ? buffer_in[plane_offset + (row + STREL_SIZE-1)*w + col] : 255;

    //read px from (16:19,0:15) --> the Lower part of tile (to do this i use thread (12:15,0:15)
    if (threadIdx.x > (TILE_W-STREL_SIZE))
        //if my bouds are in the image i add the patch of image that exceeds the TILE_WIDTH else i add ficticious pixels (that not exists
        // because i exceed the width of image
          data[threadIdx.x + STREL_SIZE-1][threadIdx.y] = col + STREL_SIZE-1 < w ? buffer_in[plane_offset + row*w+col + STREL_SIZE-1] : 255;

    //read px from (16:19,16:19) --> the Lower part of tile (to do this i use thread (12:15,12:15)
     if ((threadIdx.x > (TILE_W-STREL_SIZE)) && (threadIdx.y > (TILE_H-STREL_SIZE)))
         //if my bouds are in the image i add the patch of image that exceeds the TILE_WIDTH else i add ficticious pixels (that not exists
         // because i exceed the width of image
          data[threadIdx.x + STREL_SIZE-1][threadIdx.y + STREL_SIZE-1] = (row + STREL_SIZE-1<h && col + STREL_SIZE-1<w) ? buffer_in[plane_offset+(row + STREL_SIZE-1)*w + col + STREL_SIZE-1] : 255;

     //wait for all threads to finish read
     __syncthreads();



      unsigned char min_value = 255;
      for(x=0;x<STREL_SIZE;x++){
          for(y=0;y<STREL_SIZE;y++){
              //line 179
              min_value = min( (data[threadIdx.x+x][threadIdx.y+y]) , min_value);
              }

          }
      buffer_out[gLoc]= min_value;
      }



}

我还发现这个slides描述了银行冲突案件。

我也做了如下图所示的分析：

表示的行在代码中被注释

避免卷积中的银行冲突

0 个答案: