Why does this matrix transpose kernel run faster when the shared memory array is padded by one column?
I found the kernel in PyCuda/Examples/MatrixTranspose.
Source
import pycuda.driver
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy

block_size = 16

def _get_transpose_kernel(offset):
    mod = SourceModule("""
    #define BLOCK_SIZE %(block_size)d
    #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width)
    #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height)

    __global__ void transpose(float *A_t, float *A, int a_width, int a_height)
    {
        // Base indices in A and A_t
        int base_idx_a   = blockIdx.x * BLOCK_SIZE + blockIdx.y * A_BLOCK_STRIDE;
        int base_idx_a_t = blockIdx.y * BLOCK_SIZE + blockIdx.x * A_T_BLOCK_STRIDE;

        // Global indices in A and A_t
        int glob_idx_a   = base_idx_a   + threadIdx.x + a_width  * threadIdx.y;
        int glob_idx_a_t = base_idx_a_t + threadIdx.x + a_height * threadIdx.y;

        /** why does the +1 offset make the kernel faster? **/
        __shared__ float A_shared[BLOCK_SIZE][BLOCK_SIZE + %(offset)d];

        // Store transposed submatrix to shared memory
        A_shared[threadIdx.y][threadIdx.x] = A[glob_idx_a];
        __syncthreads();

        // Write transposed submatrix to global memory
        A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y];
    }
    """ % {"block_size": block_size, "offset": offset})

    kernel = mod.get_function("transpose")
    kernel.prepare("PPii", block=(block_size, block_size, 1))
    return kernel

def transpose(tgt, src, offset):
    krnl = _get_transpose_kernel(offset)
    w, h = src.shape
    assert tgt.shape == (h, w)
    assert w % block_size == 0
    assert h % block_size == 0
    krnl.prepared_call((w / block_size, h / block_size),
                       tgt.gpudata, src.gpudata, w, h)

def run_benchmark():
    from pycuda.curandom import rand
    print pycuda.autoinit.device.name()
    print "time\tGB/s\tsize\toffset\t"
    for offset in [0, 1]:
        for size in [2048, 2112]:
            source = rand((size, size), dtype=numpy.float32)
            target = gpuarray.empty((size, size), dtype=source.dtype)
            start = pycuda.driver.Event()
            stop = pycuda.driver.Event()
            warmup = 2
            for i in range(warmup):
                transpose(target, source, offset)
            pycuda.driver.Context.synchronize()
            start.record()
            count = 10
            for i in range(count):
                transpose(target, source, offset)
            stop.record()
            stop.synchronize()
            elapsed_seconds = stop.time_since(start) * 1e-3
            mem_bw = source.nbytes / elapsed_seconds * 2 * count / 1024 / 1024 / 1024
            print "%6.4fs\t%6.4f\t%i\t%i" % (elapsed_seconds, mem_bw, size, offset)

run_benchmark()
Output
Quadro FX 580
time GB/s size offset
0.0802s 3.8949 2048 0
0.0829s 4.0105 2112 0
0.0651s 4.7984 2048 1
0.0595s 5.5816 2112 1
The code is adapted from the PyCuda examples.
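For reference, the GB/s column follows directly from the formula in run_benchmark; here is a quick back-of-the-envelope check against the size-2048, offset-1 row (approximate only, since the printed 0.0651 s is rounded):

    # Reproduce the GB/s figure for the size=2048, offset=1 row above.
    # Each transpose reads one float32 matrix and writes one (2 * nbytes),
    # and the timed loop runs count = 10 iterations.
    nbytes = 2048 * 2048 * 4                  # 16 MiB per matrix
    print nbytes * 2 * 10 / 0.0651 / 1024**3  # ~4.80, matching the 4.7984 entry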
Answer (score: 3)
The answer is shared memory bank conflicts. The CUDA hardware you are using arranges shared memory into 16 banks, with successive 32-bit words "striped" in order across those 16 banks. If two threads try to access the same bank at the same time, a conflict occurs and their accesses must be serialized. That is what you are seeing here. By widening the row stride of the shared memory array by 1, you guarantee that the same column index in consecutive rows of the shared array falls into different banks, which eliminates most of the possible conflicts.
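To see the conflict concretely: during the write to global memory, the 16 threads of a half-warp read A_shared[threadIdx.x][threadIdx.y], i.e. they walk down one column of the shared array. A minimal model of the bank layout (plain Python, no GPU required; the 16-bank, 32-bit-word parameters are those of this hardware generation) shows what happens to that column access:

    BANKS = 16
    BLOCK_SIZE = 16

    def banks_touched(offset, y=0):
        # Bank hit by A_shared[x][y] for each thread x = 0..15 of a half-warp,
        # given a row stride of BLOCK_SIZE + offset 32-bit words.
        row_stride = BLOCK_SIZE + offset
        return [(x * row_stride + y) % BANKS for x in range(BLOCK_SIZE)]

    print banks_touched(0)  # [0, 0, ..., 0]  -> all 16 threads hit bank 0: 16-way conflict
    print banks_touched(1)  # [0, 1, ..., 15] -> one thread per bank: conflict-free

With offset 0 the sixteen reads are serialized one after another; with offset 1 each thread lands in its own bank and the half-warp is serviced at full speed.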
This phenomenon (along with a related global memory phenomenon called partition camping) is discussed in depth in the paper "Optimizing Matrix Transpose in CUDA" that ships with the SDK matrix transpose example. It is well worth reading.