I am optimising my code to work with the majority of CUDA cards it will encounter.
Even though every source of information indicates that with compute capability 2.x the grid can be as large as (65535, 65535, 65535), I cannot use a grid larger than (65535, 8192, 1).
The sample code below shows that even with a blockSize of (1,1,1) and an empty kernel, running on a grid larger than that size produces the error "code = 4 (cudaErrorLaunchFailure)".
OS: Win10 Pro
Hardware: GTS 450
SDK: CUDA 8.0, VS2013CE (used via the path given to nvcc's -ccbin option)
Test code:
#include <helper_cuda.h>

__global__ void KernelTest()
{}

int main()
{
    int cudaDevice = 0;
    int driverVersion = 0, runtimeVersion = 0;
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }
    cudaSetDevice(cudaDevice);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, cudaDevice);
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
    printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
    char msg[256];
    …
    // Code from deviceQuery
    …
    const char *sComputeMode[] =
    {
        "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
        "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
        "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
        "Unknown",
        NULL
    };
    printf(" Compute Mode:\n");
    printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);

    //dim3 gridtest(deviceProp.maxGridSize[0]-1, deviceProp.maxGridSize[1]-1, deviceProp.maxGridSize[2]-1);
    dim3 gridtest(deviceProp.maxGridSize[0], 1, 1);
    dim3 blocktest(1);
    KernelTest<<<gridtest,blocktest>>>();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaPeekAtLastError());

    dim3 gridtest2(deviceProp.maxGridSize[0]/2, 2, 1);
    KernelTest<<<gridtest2,blocktest>>>();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaPeekAtLastError());

    dim3 gridtest3(deviceProp.maxGridSize[0]/4, 4, 1);
    KernelTest<<<gridtest3,blocktest>>>();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaPeekAtLastError());

    dim3 gridtest4(deviceProp.maxGridSize[0], 2, 1);
    KernelTest<<<gridtest4,blocktest>>>();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaPeekAtLastError());

    dim3 gridtest5(deviceProp.maxGridSize[0], 4, 1);
    KernelTest<<<gridtest5,blocktest>>>();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaPeekAtLastError());

    dim3 gridtest6(deviceProp.maxGridSize[0], (deviceProp.maxGridSize[1]+1)/16, 1); // 4096
    KernelTest<<<gridtest6,blocktest>>>();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaPeekAtLastError());

    dim3 gridtest7(deviceProp.maxGridSize[0], (deviceProp.maxGridSize[1]+1)/8, 1); // 8192
    KernelTest<<<gridtest7,blocktest>>>();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaPeekAtLastError());

    dim3 gridtest8(deviceProp.maxGridSize[0], (deviceProp.maxGridSize[1]+1)/4, 1); // 16384 - Causes Error
    KernelTest<<<gridtest8,blocktest>>>();
    cudaDeviceSynchronize();
    checkCudaErrors(cudaPeekAtLastError());

    // dim3 gridtest9(deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], 1);
    // KernelTest<<<gridtest9,blocktest>>>();
    // cudaDeviceSynchronize();
    // checkCudaErrors(cudaPeekAtLastError());

    cudaDeviceReset();
}
Output from the deviceQuery portion:
CUDA Driver Version / Runtime Version 9.1 / 8.0
CUDA Capability Major/Minor version number: 2.1
Total amount of global memory: 1024 MBytes (1073741824 bytes)
( 4) Multiprocessors, ( 48) CUDA Cores/MP: 192 CUDA Cores
GPU Max Clock rate: 1566 MHz (1.57 GHz)
Memory Clock rate: 1804 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 262144 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per multiprocessor: 1536
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (65535, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Answer 0 (score: 4)
The key piece of information in the question is this:
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Because you are using a WDDM device, the display driver limits how much wall-clock time a kernel can consume. If that limit is exceeded, the driver kills your kernel.
This is exactly what is happening here (the cudaErrorLaunchFailure confirms it). Scheduling large numbers of blocks is not free, and when many blocks must be scheduled, even an empty kernel can take several seconds to complete. In your case this is made worse by the small, old GPU you are using, which can only run 32 blocks simultaneously, meaning the driver has to make a great many block-scheduling passes to retire the roughly one billion blocks a single launch of the largest grid would run.
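Whether a given device is subject to this watchdog can be checked at runtime: cudaDeviceProp exposes both the timeout flag and the TCC/WDDM driver mode, which is where the deviceQuery lines quoted above come from. A minimal sketch, assuming only the standard CUDA runtime API:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess)
        return 1;

    // kernelExecTimeoutEnabled is non-zero when the display driver's
    // watchdog applies to this device; tccDriver is non-zero in TCC mode,
    // where no watchdog applies.
    printf("Run time limit on kernels: %s\n",
           prop.kernelExecTimeoutEnabled ? "Yes" : "No");
    printf("Driver mode: %s\n", prop.tccDriver ? "TCC" : "WDDM");
    return 0;
}
```

On a WDDM display device like the GTS 450 in the question, this prints the timeout as enabled, matching the "Run time limit on kernels: Yes" line in the deviceQuery output.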
For reference, here is profiler output from a non-display GPU hosted on a Linux system, which can hold far more blocks resident at once than your GPU can (416 vs. 32):
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Device Context Stream Name
235.86ms 139.29us (65535 1 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [106]
236.03ms 138.49us (32767 2 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [109]
236.19ms 138.46us (16383 4 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [112]
236.35ms 275.58us (65535 2 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [115]
236.65ms 550.09us (65535 4 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [118]
237.22ms 504.49ms (65535 4096 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [121]
741.79ms 924.72ms (65535 8192 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [124]
1.66659s 1.84941s (65535 16384 1) (1 1 1) 8 0B 0B GeForce GTX 970 1 7 KernelTest(void) [127]
You can see that the 65535 x 16384 case takes 1.8 seconds to run; on your GPU it will take even longer. Hopefully you will also conclude from this that running enormous numbers of blocks is not an optimization, since block scheduling has a nonzero cost.
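One standard way to avoid launching grids of this size at all (not part of the original answer, but a widely used technique) is a grid-stride loop: launch a modest, fixed-size grid and have each thread step through the problem domain in strides of the total thread count. A sketch with a hypothetical ScaleKernel:

```cuda
#include <cuda_runtime.h>

// Each thread handles elements i, i + stride, i + 2*stride, ... where
// stride is the total number of threads in the grid, so a small grid
// covers an arbitrarily large n without launching millions of blocks.
__global__ void ScaleKernel(float *data, float factor, size_t n)
{
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
        data[i] *= factor;
}

// Hypothetical launch: a few blocks per multiprocessor is typically
// enough to keep the GPU busy, e.g. with numSMs from cudaDeviceProp:
//   ScaleKernel<<<8 * numSMs, 256>>>(d_data, 2.0f, n);
```

Because the loop bound is the data size rather than the grid size, the same launch configuration works for any n, and block-scheduling overhead stays constant.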