Why does padding a shared memory array by one column make the kernel 40% faster?

Time: 2011-08-11 10:40:43

Tags: gpgpu cuda

Why does this matrix transpose kernel run faster when the shared memory array is padded with one extra column?

I found the kernel at PyCuda/Examples/MatrixTranspose.

Source

import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import pycuda.driver
from pycuda.compiler import SourceModule
import numpy

block_size = 16    

def _get_transpose_kernel(offset):
    mod = SourceModule("""
    #define BLOCK_SIZE %(block_size)d
    #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
    #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)

    __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
    {
        // Base indices in A and A_t
        int base_idx_a   = blockIdx.x * BLOCK_SIZE + blockIdx.y * A_BLOCK_STRIDE;
        int base_idx_a_t = blockIdx.y * BLOCK_SIZE + blockIdx.x * A_T_BLOCK_STRIDE;

        // Global indices in A and A_t
        int glob_idx_a   = base_idx_a + threadIdx.x + a_width * threadIdx.y;
        int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;

        /** why does the +1 offset make the kernel faster? **/
        __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE+%(offset)d]; 

        // Load the submatrix into shared memory (the transposed
        // ordering is applied when it is read back out below)
        A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];

        __syncthreads();

        // Write transposed submatrix to global memory
        A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
    }
    """% {"block_size": block_size, "offset": offset})

    kernel = mod.get_function("transpose")
    kernel.prepare("PPii", block=(block_size, block_size, 1))
    return kernel 


def transpose(tgt, src, offset):
    krnl = _get_transpose_kernel(offset)
    w, h = src.shape
    assert tgt.shape == (h, w)
    assert w % block_size == 0
    assert h % block_size == 0
    krnl.prepared_call((w / block_size, h / block_size), tgt.gpudata, src.gpudata, w, h)


def run_benchmark():
    from pycuda.curandom import rand
    print pycuda.autoinit.device.name()
    print "time\tGB/s\tsize\toffset\t"
    for offset in [0, 1]:
        for size in [2048, 2112]:

            source = rand((size, size), dtype=numpy.float32)
            target = gpuarray.empty((size, size), dtype=source.dtype)

            start = pycuda.driver.Event()
            stop = pycuda.driver.Event()

            warmup = 2
            for i in range(warmup):
                transpose(target, source, offset)

            pycuda.driver.Context.synchronize()
            start.record()

            count = 10          
            for i in range(count):
                transpose(target, source, offset)

            stop.record()
            stop.synchronize()

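            # time_since() returns milliseconds and covers all `count` runs;
            # each transpose moves source.nbytes twice (one read + one write)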
            elapsed_seconds = stop.time_since(start) * 1e-3
            mem_bw = source.nbytes / elapsed_seconds * 2 * count / 1024 / 1024 / 1024

            print "%6.4fs\t%6.4f\t%i\t%i" % (elapsed_seconds, mem_bw, size, offset)


run_benchmark()

Output

Quadro FX 580
time    GB/s    size    offset  
0.0802s 3.8949  2048    0
0.0829s 4.0105  2112    0
0.0651s 4.7984  2048    1
0.0595s 5.5816  2112    1
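
As a sanity check on those numbers (my arithmetic, not part of the original post): the time column is the total for the 10 timed transposes, and each transpose reads and writes the full matrix once, which is exactly what the mem_bw line in the benchmark computes.

nbytes = 2048 * 2048 * 4     # one float32 matrix, 16 MiB
count = 10                   # timed iterations
elapsed = 0.0802             # total seconds for the first row of the table
print(nbytes * 2 * count / elapsed / 1024.0 ** 3)   # ~3.89, matching the GB/s column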

Code adapted from the PyCuda examples.

1 Answer:

Answer 0 (score: 3):

The answer is shared memory bank conflicts. The CUDA hardware you are using arranges shared memory into 16 banks, with successive 32-bit words "striped" across those 16 banks in order. If two threads in the same half-warp try to access the same bank at the same time, a conflict occurs and the accesses must be serialized. That is what you are seeing here: the read A_shared[threadIdx.x][threadIdx.y] walks down a column of the shared array, and with a row stride of 16 every element of a column lives in the same bank, so all 16 threads of a half-warp collide (a 16-way conflict). By padding the row stride of the shared memory array by 1, you ensure that the same column index in consecutive rows falls into different banks, which eliminates most of the possible conflicts.
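
To make the striping concrete, here is a minimal sketch (mine, not from the answer) that computes which bank each thread of a half-warp hits during that column read, assuming the compute-capability-1.x rule that the 32-bit word at linear offset n lives in bank n % 16:

BLOCK_SIZE = 16

def banks_touched(offset, ty=0):
    # A_shared[tx][ty] sits at linear word offset tx * row_stride + ty
    row_stride = BLOCK_SIZE + offset
    return [(tx * row_stride + ty) % 16 for tx in range(16)]

print(banks_touched(0))  # [0, 0, ..., 0]  all 16 threads hit bank 0: 16-way conflict
print(banks_touched(1))  # [0, 1, ..., 15] one thread per bank: conflict-free

With the padded stride of 17, walking down a column advances one bank per row, which is why a single extra column of padding removes the serialization.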

This phenomenon (along with a related global memory effect called partition camping) is discussed in depth in the paper "Optimizing Matrix Transpose in CUDA" that ships with the SDK matrix transpose example. It is well worth reading.