Question

我尝试使用共享内存来加速我的内核。使用全局内存的原始版本如下所示：

__global__ void my_kernel(float* inout, float* in, float* const_array)
{
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    if (y < gpu_ny && x < gpu_nx) {
        for (int z = 0; z < gpu_nz; ++z) {
            int index = x + y * gpu_nx + gpu_nxy * z;
            inout[index] = const_array[index] * (
                C0 * in[index] +
                C1 * (in[index - gpu_nx] + in[index + gpu_nx]) +
                C2 * (in[index - 2 * gpu_nx] + in[index + 2 * gpu_nx]) +
                C3 * (in[index - 3 * gpu_nx] + in[index + 3 * gpu_nx]) +
                C4 * (in[index - 4 * gpu_nx] + in[index + 4 * gpu_nx]) +
                C5 * (in[index - 1] + in[index + 1]) +
                C6 * (in[index - 2] + in[index + 2]) +
                C7 * (in[index - 3] + in[index + 3]) +
                C8 * (in[index - 4] + in[index + 4]) +
                C9 * (in[index - 1 * gpu_nxy] + in[index + 1 * gpu_nxy]) +
                C10 * (in[index - 2 * gpu_nxy] + in[index + 2 * gpu_nxy]) +
                C11 * (in[index - 3 * gpu_nxy] + in[index + 3 * gpu_nxy]) +
                C12 * (in[index - 4 * gpu_nxy] + in[index + 4 * gpu_nxy])
            ) + in[index] + in[index] - inout[index];
        }
    }
}

此类问题的典型方法是将全局值加载到共享内存中，并在线程之间重用它们。

__global__ void my_kernel(float* inout, float* in, float* const_array)
{
    __shared__ float in_shared[BLOCK_NY + 2 * RADIUS][BLOCK_NX + 2 * RADIUS];

    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    int sx = threadIdx.x + RADIUS;
    int sy = threadIdx.y + RADIUS;

    for (int z = 0; z < gpu_nz; ++z) {
        bool in_bounds = y < gpu_ny && x < gpu_nx;
        int index = x + y * gpu_nx + z * gpu_nxy;
        if (in_bounds) {
            in_shared[sy][sx] = in[index];
            /*if (threadIdx.x < RADIUS) {
                in_shared[sy][threadIdx.x] = in[index - RADIUS];
                in_shared[sy][sx + BLOCK_SIZE] = in[index + BLOCKX_SIZE];
            }
            if (threadIdx.y < RADIUS) {
                in_shared[threadIdx.y][sx] = in[index - gpu_nx * RADIUS];
                in_shared[sy + BLOCK_SIZE][sx] = in[index + gpu_nx * BLOCKY_SIZE];
            }*/
        }

        // __syncthreads();

        if (in_bounds) {
            inout[index] = const_array[index] * (
                C0 * in_shared[sy][sx] +
                C1 * (in_shared[sy-1][sx] + in_shared[sy+1][sx]) +
                C2 * (in_shared[sy-2][sx] + in_shared[sy+2][sx]) +
                C3 * (in_shared[sy-3][sx] + in_shared[sy+3][sx]) +
                C4 * (in_shared[sy-4][sx] + in_shared[sy+4][sx]) +
                C5 * (in_shared[sy][sx-1] + in_shared[sy][sx+1]) +
                C6 * (in_shared[sy][sx-2] + in_shared[sy][sx+2]) +
                C7 * (in_shared[sy][sx-3] + in_shared[sy][sx+3]) +
                C8 * (in_shared[sy][sx-4] + in_shared[sy][sx+4]) +
                C9 * (in[index - 1 * gpu_nxy] + in[index + 1 * gpu_nxy]) +
                C10 * (in[index - 2 * gpu_nxy] + in[index + 2 * gpu_nxy]) +
                C11 * (in[index - 3 * gpu_nxy] + in[index + 3 * gpu_nxy]) +
                C12 * (in[index - 4 * gpu_nxy] + in[index + 4 * gpu_nxy])
            ) + in_shared[sy][sx] + in_shared[sy][sx] - inout[index];
        }
    }
}

与预期相反，这使得我的性能低于仅使用全局内存的内核（大约慢4倍）。我试图调查原因。

首先，我注释掉了读取边界值的代码和__syncthreads()。我知道这使我的代码不正确，但我想评估性能影响。不过，当我使用共享内存时，性能要差得多。

我知道这里不存在银行冲突。首先，warp中的所有32个线程访问连续浮动，这意味着每个线程将由不同的内存库处理。此外，我运行了nvprof来查找shared_ld_bank_conflict和shared_st_bank_conflict个事件，但未发现这些事件。

数据重用率很高，不正确的版本应该要求全局读取少9倍。我甚至尝试了不同的方法，例如使用共享内存在z方向上重用数据。然而，内核运行速度明显变慢。我做错了什么？

CUDA共享内存比全局慢，即使没有库冲突且数据重用率很高

0 个答案: