CUDA parallel reduction #6 works, while reduction #7 fails

Date: 2018-07-28 15:18:57

Tags: cuda gpu reduce

I am using this code for the reduction:

http://www.math.nsysu.edu.tw/~lam/MPI/code/cuda/reduction.cu

which is based on Mark Harris's talk, found here:

http://www.math.nsysu.edu.tw/~lam/MPI/lecture/reduction.pdf

However, kernel reduce6 works while reduce7 fails. Is that because reduce7 depends on the amount of shared memory, which has to be as large as the "size" defined above?

The relevant snippet is here:

#define blocksize 1024
#define gridsize  1024*8
#define size blocksize*gridsize

The kernel, which is called from main, looks like this:

#define THR_PER_BLC 1024
#define BLC_PER_GRD  16
#define GRID_SIZE THR_PER_BLC * BLC_PER_GRD

template<unsigned int nThreads>
__global__ void reduce7(int *g_idata, int *g_odata, unsigned int n) {
     //I added GRID_SIZE myself so it can be volatile
     __shared__ volatile  int sdata[GRID_SIZE]; 
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (nThreads * 2) + threadIdx.x;
    unsigned int gridSize = nThreads * 2 * gridDim.x;
    sdata[tid] = 0;
    while (i < n) {
        sdata[tid] += g_idata[i] + g_idata[i + nThreads];
        i += gridSize;
    }
    __syncthreads();
// reduction in shared memory
    if (nThreads >= 512) {
        if (tid < 256) { sdata[tid] += sdata[tid + 256]; }
        __syncthreads();
    }
    if (nThreads >= 256) {
        if (tid < 128) { sdata[tid] += sdata[tid + 128]; }
        __syncthreads();
    }
    if (nThreads >= 128) {
        if (tid < 64) { sdata[tid] += sdata[tid + 64]; }
        __syncthreads();
    }
    if (tid < 32) {
        if (nThreads >= 64) sdata[tid] += sdata[tid + 32];
        if (nThreads >= 32) sdata[tid] += sdata[tid + 16];
        if (nThreads >= 16) sdata[tid] += sdata[tid + 8];
        if (nThreads >= 8) sdata[tid] += sdata[tid + 4];
        if (nThreads >= 4) sdata[tid] += sdata[tid + 2];
        if (nThreads >= 2) sdata[tid] += sdata[tid + 1];
// transfer of the result to global memory
        if (tid == 0) g_odata[blockIdx.x] = sdata[0];
    }
}

Fundamentally, does this mean reduce7 cannot be called with a large GRID_SIZE?

Here is my test:

threads = THR_PER_BLC /2 ;

int gsize = BLC_PER_GRD /8;

switch (threads) {
    case 512:
        reduce7<512> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
        break;
    case 256:
        reduce7<256> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
        break;
    case 128:
        reduce7<128> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
        break;
    case 64:
        reduce7<64> << < gsize, threads  >> > (g_idata, g_odata, GRID_SIZE);
        break;
    case 32:
        reduce7<32> << < gsize, threads  >> > (g_idata, g_odata, GRID_SIZE);
        break;
    case 16:
        reduce7<16> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
        break;
    case 8:
        reduce7<8> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
        break;
    case 4:
        reduce7<4> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
        break;
    case 2:
        reduce7<2> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
        break;
    case 1:
        reduce7<1> << < gsize, threads >> > (g_idata, g_odata, GRID_SIZE);
        break;
}
cudaThreadSynchronize();

Here are the results:

#################################################################
6 Unroll the complete loop
Kernal elapsed time =      0.030(ms)
Elapsed time =      0.057(ms)
Sum = 8192, with BLC_PER_GRD 16 THR_PER_BLC 512
#################################################################
7 Final
Kernal elapsed time =      0.015(ms), band =
Elapsed time =      0.040(ms)
Sum = 8192, with BLC_PER_GRD 16 THR_PER_BLC 512
#################################################################

#################################################################
6 Unroll the complete loop
Kernal elapsed time =      0.031(ms)
Elapsed time =      0.057(ms)
Sum = 8192, with BLC_PER_GRD 8 THR_PER_BLC 1024
#################################################################
7 Final
Kernal elapsed time =      0.015(ms), band =
Elapsed time =      0.040(ms)
Sum = 8192, with BLC_PER_GRD 8 THR_PER_BLC 1024
#################################################################

#################################################################
6 Unroll the complete loop
Kernal elapsed time =      0.569(ms)
Elapsed time =     12.889(ms)
Sum = 8388608, with BLC_PER_GRD 8192 THR_PER_BLC 1024
#################################################################

Hmm, let's first set 128 threads with a grid size of 4. And here is my GPU:

a@M:/usr/local/cuda/samples/bin/x86_64/linux/release$ ./dev*Drv
./deviceQueryDrv Starting...

CUDA Device Query (Driver API) statically linked version
Detected 1 CUDA Capable device(s)

Device 0: "GeForce GTX 1060 6GB"
  CUDA Driver Version:                           9.2
  CUDA Capability Major/Minor version number:    6.1
  Total amount of global memory:                 6078 MBytes (6373572608 bytes)
  (10) Multiprocessors, (128) CUDA Cores/MP:     1280 CUDA Cores
  GPU Max Clock rate:                            1709 MHz (1.71 GHz)
  Memory Clock rate:                             4004 Mhz
  Memory Bus Width:                              192-bit
  L2 Cache Size:                                 1572864 bytes
  Max Texture Dimension Sizes                    1D=(131072) 2D=(131072, 65536) 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size (x,y,z):    (2147483647, 65535, 65535)
  Texture alignment:                             512 bytes
  Maximum memory pitch:                          2147483647 bytes
  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Concurrent kernel execution:                   Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 3 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

Then reduce7 works. So does that mean reduce7 strictly depends on the max shm?

EDIT:

It seems I got confused by these lines: #define MAX_SHM 49152, #define GRID_SIZE MAX_SHM / sizeof(int), #define THR_PER_BLC 128, #define BLC_PER_GRD GRID_SIZE / THR_PER_BLC, where n is GRID_SIZE. So now I do not know what while (i < n) { means; I will need some time to digest it. But it is good to know that a block can only hold a certain number of threads, and in this case we have to match that to the SM.
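For illustration, here is a hypothetical CPU-side sketch of what the while (i < n) grid-stride loop in reduce7 computes, using small made-up sizes (nThreads = 4, gridDimX = 2, n = 32) rather than the real defines; it shows that every input element is visited exactly once:

#include <cstdio>

// Hypothetical CPU-side equivalent of the grid-stride accumulation in reduce7:
// each (block, thread) pair starts at i = block * (nThreads * 2) + tid and then
// jumps ahead by nThreads * 2 * gridDimX until it runs past n.
int main() {
    const unsigned nThreads = 4, gridDimX = 2, n = 32;   // made-up sizes
    int data[n];
    for (unsigned k = 0; k < n; ++k) data[k] = 1;

    long long total = 0;
    for (unsigned block = 0; block < gridDimX; ++block) {
        for (unsigned tid = 0; tid < nThreads; ++tid) {
            unsigned i = block * (nThreads * 2) + tid;
            unsigned gridSize = nThreads * 2 * gridDimX;
            int acc = 0;                          // plays the role of sdata[tid]
            while (i < n) {
                acc += data[i] + data[i + nThreads];
                i += gridSize;
            }
            total += acc;
        }
    }
    printf("total = %lld (expected %u)\n", total, n);    // prints 32
    return 0;
}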

1 Answer:

Answer (score: 4):

First of all, the shared memory needed by this reduction only has to be as large as the requirement of a block, not of the grid. So there is no reason to size the shared memory allocation by the grid size.

Second, this requests 64KB of statically allocated shared memory per block:

 __shared__ volatile  int sdata[GRID_SIZE]; 

That is not going to work, because:

Total amount of shared memory per block:       49152 bytes

In addition, this requests 64KB of dynamically allocated shared memory per block:

 case 128:
    reduce7<128> << < gsize, threads, GRID_SIZE * sizeof(int) >> > (g_idata, g_odata, GRID_SIZE);
    break;

So that combination (64K + 64K) is never going to work.
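As a minimal host-side sketch of how that limit could be checked up front (the 16384 here is just GRID_SIZE from the question, i.e. a 64KB request; the check and message are only illustrative):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int maxShmPerBlock = 0;
    // Reports the same 49152-byte per-block limit that deviceQuery prints.
    cudaDeviceGetAttribute(&maxShmPerBlock, cudaDevAttrMaxSharedMemoryPerBlock, 0);

    size_t requested = 16384 * sizeof(int);   // GRID_SIZE ints = 65536 bytes
    printf("per-block limit = %d bytes, requested = %zu bytes\n",
           maxShmPerBlock, requested);
    if (requested > (size_t)maxShmPerBlock)
        printf("a 64KB shared memory request per block cannot work on this GPU\n");
    return 0;
}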

You seem to be confused about how shared memory is used and how much of it is needed per block. A block only needs one quantity per thread (in this case, one int per thread).

You may also be confused about the syntax and usage of statically allocated shared memory vs. dynamically allocated shared memory. For this type of problem, you would normally use one or the other, not both.
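For what it's worth, a minimal sketch of the two forms side by side (kernel names and sizes are made up for illustration):

#define THREADS 128

__global__ void staticShared(int *out) {
    // Statically allocated: size fixed at compile time, no launch parameter needed.
    __shared__ int sdata[THREADS];              // THREADS * sizeof(int) bytes per block
    sdata[threadIdx.x] = (int)threadIdx.x;
    __syncthreads();
    if (threadIdx.x == 0) out[blockIdx.x] = sdata[0];
}

__global__ void dynamicShared(int *out) {
    // Dynamically allocated: size comes from the third <<<...>>> launch argument.
    extern __shared__ int sdata[];
    sdata[threadIdx.x] = (int)threadIdx.x;
    __syncthreads();
    if (threadIdx.x == 0) out[blockIdx.x] = sdata[0];
}

// Launches (d_out assumed to be a device array with one int per block):
//   staticShared <<< blocks, THREADS >>>(d_out);
//   dynamicShared<<< blocks, THREADS, THREADS * sizeof(int) >>>(d_out);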

I have no idea what this comment means:

 //I added GRID_SIZE myself so it can be volatile

The usual suggestions apply: any time you are having trouble with a CUDA code, you should use proper CUDA error checking and run your code with cuda-memcheck before asking others for help. Even if the example code you started from does not have proper CUDA error checking, you should add it once you start making modifications and running into trouble.
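A minimal sketch of what that could look like (the macro name and usage below are just one common pattern, not the only one):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err__ = (call);                                           \
        if (err__ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                     \
                    cudaGetErrorString(err__), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Typical use around the launch in the question:
//   reduce7<128><<< gsize, threads >>>(g_idata, g_odata, GRID_SIZE);
//   CUDA_CHECK(cudaGetLastError());        // catches invalid launch configurations
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches errors during kernel execution
// and then run the whole binary under cuda-memcheck:
//   cuda-memcheck ./reduction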


"Then reduce7 works. So does that mean reduce7 strictly depends on the max shm?"

It means that reduce7 requires a certain amount of shared memory per block. That amount is one int per thread. That is all it needs. If you provide more, that is OK (sort of), as long as you do not exceed the maximum that can be provided. If you exceed the maximum that can be provided, the whole kernel launch will fail.

In other words, all you actually need is:

__shared__ volatile  int sdata[THR_PER_BLC];
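To put numbers on it (a sketch, assuming the THR_PER_BLC of 1024 from the question):

// With the declaration above, each block uses THR_PER_BLC * sizeof(int) bytes of
// shared memory: 1024 * 4 = 4096 bytes, well under the 49152-byte per-block limit.
// Because the allocation is static, the third <<<...>>> argument is simply omitted:
//   reduce7<512><<< gsize, threads >>>(g_idata, g_odata, GRID_SIZE);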