// Writes each thread's in-block linear index into a padded shared tile using
// [threadIdx.y][threadIdx.x] order, then reads the tile back with the two
// indices swapped and stores the value to global memory.
//
// out: destination buffer; element [threadIdx.y * blockDim.x + threadIdx.x]
//      is written by each thread, so it must hold at least
//      blockDim.x * blockDim.y ints. blockIdx is not used, so every block
//      writes the same range.
//
// NOTE(review): the tile is 32 rows by 33 columns, so this presumably expects
// a 32x32 thread block (both block dimensions must not exceed 32) -- confirm
// against the launch site.
__global__ void setRowReadColPad(int *out)
{
    // Static shared tile; the inner dimension has one extra column (33, not
    // 32). Padding the inner dimension is the usual way to avoid bank
    // conflicts on the swapped-index read below.
    __shared__ int tile[32][33];

    const unsigned int row = threadIdx.y;
    const unsigned int col = threadIdx.x;

    // Linear position of this thread inside its block; doubles as the
    // global-memory offset.
    const unsigned int linear = row * blockDim.x + col;

    // Store phase: [row][col] order.
    tile[row][col] = linear;

    // Every thread's store must be visible before any swapped-index load.
    __syncthreads();

    // Load phase: indices swapped relative to the store.
    out[linear] = tile[col][row];
}
使用共享内存填充时，内核运行需要 13.473 µs；
不进行填充时，内核运行需要 5.025 µs。
有人可以解释一下，为什么填充会对内核的运行时间产生如此大的影响吗？