如何使用某些矩阵大小和图块大小的共享内存修复图块矩阵乘法中的CUDA错误“未指定的启动失败”?

时间:2019-01-09 12:27:02

标签: matrix cuda shared-memory matrix-multiplication

我正在使用共享内存在CUDA-C中实现平铺矩阵乘法算法,用于任意大小的矩阵。

对于某些矩阵大小,我会收到CUDA错误,而对于其他大小,一切都可以正常工作。

图块宽度(TILE_WIDTH)本应是可配置的,但在开发阶段固定为 32。

例如,这些矩阵大小会导致 CUDA错误

 1000 x    100   *      100 x   1000   =   1000 x   1000
 1000 x   1000   *     1000 x   1000   =   1000 x   1000
10000 x      1   *        1 x  10000   =  10000 x  10000
    1 x 100000   *   100000 x      1   =      1 x      1
  999 x    999   *      999 x    999   =    999 x    999
  512 x    512   *      512 x    512   =    512 x    512
 1024 x   1024   *     1024 x   1024   =   1024 x   1024

例如,这些矩阵大小正常工作

  100 x    100   *      100 x    100   =    100 x    100
  200 x    100   *      100 x    200   =    200 x    200
  100 x   1000   *     1000 x    100   =    100 x    100
 1000 x     10   *       10 x   1000   =   1000 x   1000
 1000 x      1   *        1 x   1000   =   1000 x   1000
    1 x  10000   *    10000 x      1   =      1 x      1
   99 x     99   *       99 x     99   =     99 x     99
   32 x     32   *       32 x     32   =     32 x     32
   64 x     64   *       64 x     64   =     64 x     64
  128 x    128   *      128 x    128   =    128 x    128
  256 x    256   *      256 x    256   =    256 x    256

我正在 Windows 10 计算机和 GeForce 840M 图形卡上使用 MinGW64

我使用以下命令编译/启动了代码,其中N,K和M是矩阵大小。

nvcc -lineinfo -g -arch=sm_50 tiled_matrix_mult.cu
cuda-memcheck a.exe N K M

CUDA错误消息的结构始终相同。如果N = K = M = 1000矩阵大小,则会显示以下错误消息:

C:\Users\my\path>cuda-memcheck a.exe 1000 1000 1000
========= CUDA-MEMCHECK

Cuda error in line 168: unspecified launch failure
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuD3D9UnmapVertexBuffer + 0x2e1ea0) [0x2f01db]
=========     Host Frame:C:\Users\my\path\a.exe (cudaDeviceSynchronize + 0xf8) [0x3ad8]
=========     Host Frame:C:\Users\my\path\a.exe (main + 0x60c) [0x49aec]
=========     Host Frame:C:\Users\my\path\a.exe (__scrt_common_main_seh + 0x10c) [0x4bf78]
=========     Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x13034]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x73691]
=========
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuD3D9UnmapVertexBuffer + 0x1c03f3) [0x1ce72e]
=========     Host Frame:C:\Users\my\path\a.exe (cudart::module::unload + 0x115) [0x38725]
=========     Host Frame:C:\Users\my\path\a.exe (cudart::contextState::unloadAllModules + 0x1c6) [0x38d56]
=========     Host Frame:C:\Users\my\path\a.exe (cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload + 0x78) [0x3cd98]
=========     Host Frame:C:\Users\my\path\a.exe (cudart::globalState::~globalState + 0x3f) [0x2858f]
=========     Host Frame:C:\Users\my\path\a.exe (cudart::set<cudart::globalModule * __ptr64>::rehash + 0x106) [0x2f8f6]
=========     Host Frame:C:\Users\my\path\a.exe (<lambda_f03950bc5685219e0bcd2087efbe011e>::operator() + 0xa4) [0x8494c]
=========     Host Frame:C:\Users\my\path\a.exe (__crt_seh_guarded_call<int>::operator()<<lambda_7777bce6b2f8c936911f934f8298dc43>,<lambda_f03950bc5685219e0bcd2087efbe011e> & __ptr64,<lambda_3883c3dff614d5e0c5f61bb1ac94921c> > + 0x25) [0x845a5]
=========     Host Frame:C:\Users\my\path\a.exe (_execute_onexit_table + 0x35) [0x84a6d]
=========     Host Frame:C:\Users\my\path\a.exe (<lambda_6e4b09c48022b2350581041d5f6b0c4c>::operator() + 0x84) [0x64bec]
=========     Host Frame:C:\Users\my\path\a.exe (__crt_seh_guarded_call<void>::operator()<<lambda_d80eeec6fff315bfe5c115232f3240e3>,<lambda_6e4b09c48022b2350581041d5f6b0c4c> & __ptr64,<lambda_2358e3775559c9db80273638284d5e45> > + 0x25) [0x64a95]
=========     Host Frame:C:\Users\my\path\a.exe (common_exit + 0xa3) [0x64d3b]
=========     Host Frame:C:\Users\my\path\a.exe (__scrt_common_main_seh + 0x173) [0x4bfdf]
=========     Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x13034]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x73691]
=========
========= ERROR SUMMARY: 2 errors

我不知道是什么导致了这些错误。

但可以排除数组越界(ArrayOutOfBounds)和内存不足(OutOfMemory)这两种情况:它们会产生不同的错误消息(我此前遇到并处理过)。

您是否知道如何解决“未指定的启动失败” 错误?

这是源代码

/***** IMPORTS *****/

#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <windows.h>

/***** STRUCT DEFINITIONS *****/

struct Timer
{
    LARGE_INTEGER start;                 // counter value captured by timer_start()
    LARGE_INTEGER end;                   // counter value captured by timer_end()
    LARGE_INTEGER elapsed_microseconds;  // (end - start) converted to microseconds by timer_end()
    LARGE_INTEGER frequency;             // performance-counter ticks per second (QueryPerformanceFrequency)
};

/***** DECLARATIONS *****/

// Matrix multiplication: C = A * B (two tiled device kernels and a CPU reference).
__global__ void multiply_tiled_on_device_shared(float* A, float* B, float* C, int num_A_rows, int num_A_columns, int num_B_rows, int num_B_columns, int num_C_rows, int num_C_columns);
__global__ void multiply_tiled_on_device_non_shared(float* A, float* B, float* C, int num_A_rows, int num_A_columns, int num_B_rows, int num_B_columns, int num_C_rows, int num_C_columns);
void multiply_on_host(float* A, float* B, float* C, int num_A_rows, int num_A_columns, int num_B_rows, int num_B_columns, int num_C_rows, int num_C_columns);

// Utils for matrices: pseudo-random fill and host/device result comparison.
void create_matrix(float* m, int x, int y);
void check_result(float* host_computed_C, float* host_C, int x, int y, int device_type);

// Utils for timers (Windows QueryPerformanceCounter based).
void timer_start(Timer* timer);
void timer_end(Timer* timer);

/***** STATIC VARIABLES *****/

// Default matrix sizes used when no command-line arguments are given:
// A is 3000 x 1000 and B is 1000 x 5000.
const int INPUT_MATRIX_A_SIZE_X = 3000;
const int INPUT_MATRIX_A_SIZE_Y = 1000;
const int INPUT_MATRIX_B_SIZE_X = 1000;
const int INPUT_MATRIX_B_SIZE_Y = 5000;

// Tile edge length; each thread block covers one TILE_WIDTH x TILE_WIDTH tile.
const int TILE_WIDTH = 32;

/***** MACROS *****/

// Wraps a CUDA runtime call: if the call did not return cudaSuccess, print
// the current source line plus the readable error string and return -1 from
// the ENCLOSING function. Because of that `return`, this macro is only
// usable inside functions returning int (here: main); returning early also
// skips any cleanup below the failing call.
#define CHECK_CUDA_ERROR(result)\
    do\
    {\
        cudaError_t error = result;\
        if (error != cudaSuccess)\
        {\
            printf("Cuda error in line %d: ", __LINE__);\
            printf("%s\n", cudaGetErrorString(error));\
            return -1;\
        }\
    }\
    while(0)

/***** MAIN *****/

// Entry point: builds two random matrices, multiplies them on the device
// (shared-memory tiled and non-shared tiled kernels) and on the host,
// verifies the device results against the host result, and prints timings.
// Usage: a.exe [M K N] for A(MxK) * B(KxN) = C(MxN); no args uses defaults.
int main(int argc, char ** argv)
{
    printf("\n");

    // Check command line arguments: accept either none or exactly three.
    if(argc != 1 && argc != 4){
        printf("Please provide 0 or 3 arguments.\n");
        printf("Arguments: M K N\n");
        printf(" - Matrix A of size M x K\n");
        printf(" - Matrix B of size K x N\n");
        printf(" - Matrix C=A*B of size M x N\n");
        exit(-1);
    }

    // Initialise matrix sizes.
    int num_A_rows;
    int num_A_columns;
    int num_B_rows;
    int num_B_columns;
    int num_C_rows;
    int num_C_columns;

    // Set matrix sizes. The shared K dimension (argv[2]) is used for both
    // A's columns and B's rows; atoi() performs no validation, so
    // non-numeric input silently becomes 0.
    if(argc == 4)
    {
        num_A_rows    = atoi(*(argv+1));
        num_A_columns = atoi(*(argv+2));
        num_B_rows    = atoi(*(argv+2));
        num_B_columns = atoi(*(argv+3));
        num_C_rows    = num_A_rows;
        num_C_columns = num_B_columns;
    }
    else
    {
        num_A_rows    = INPUT_MATRIX_A_SIZE_X;
        num_A_columns = INPUT_MATRIX_A_SIZE_Y;
        num_B_rows    = INPUT_MATRIX_B_SIZE_X;
        num_B_columns = INPUT_MATRIX_B_SIZE_Y;
        num_C_rows    = num_A_rows;
        num_C_columns = num_B_columns;
    }

    // Allocate memory for host.
    // NOTE(review): malloc results are not checked; a NULL return would
    // crash in create_matrix/multiply below rather than fail cleanly.
    float* host_A            = (float*) malloc(sizeof(float) * num_A_rows * num_A_columns);
    float* host_B            = (float*) malloc(sizeof(float) * num_B_rows * num_B_columns);
    float* host_C_shared     = (float*) malloc(sizeof(float) * num_C_rows * num_C_columns);
    float* host_C_non_shared = (float*) malloc(sizeof(float) * num_C_rows * num_C_columns);
    float* host_computed_C   = (float*) malloc(sizeof(float) * num_C_rows * num_C_columns);

    // Allocate memory for device (separate outputs per kernel variant).
    float* device_A;
    float* device_B;
    float* device_C_shared;
    float* device_C_non_shared;
    CHECK_CUDA_ERROR(cudaMalloc((void**)&device_A,            sizeof(float) * num_A_rows * num_A_columns));
    CHECK_CUDA_ERROR(cudaMalloc((void**)&device_B,            sizeof(float) * num_B_rows * num_B_columns));
    CHECK_CUDA_ERROR(cudaMalloc((void**)&device_C_shared,     sizeof(float) * num_C_rows * num_C_columns));
    CHECK_CUDA_ERROR(cudaMalloc((void**)&device_C_non_shared, sizeof(float) * num_C_rows * num_C_columns));

    // Create matrices A and B on host.
    create_matrix(host_A, num_A_rows, num_A_columns);
    create_matrix(host_B, num_B_rows, num_B_columns);

    // Copy matrices A and B to device.
    CHECK_CUDA_ERROR(cudaMemcpy(device_A, host_A, sizeof(float) * num_A_rows * num_A_columns, cudaMemcpyHostToDevice));
    CHECK_CUDA_ERROR(cudaMemcpy(device_B, host_B, sizeof(float) * num_B_rows * num_B_columns, cudaMemcpyHostToDevice));

    // Set dimensions for device: ceil-divide C's extent by the tile width
    // so partially-filled edge tiles are also covered; one thread per
    // element of a TILE_WIDTH x TILE_WIDTH tile.
    dim3 dimGrid((num_C_columns + TILE_WIDTH - 1) / TILE_WIDTH, (num_C_rows + TILE_WIDTH - 1) / TILE_WIDTH, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    // Initialise timer.
    Timer timer_host;
    Timer timer_device_shared;
    Timer timer_device_non_shared;

    // Warmup shared kernel (excluded from timing).
    // NOTE(review): an "unspecified launch failure" reported by the
    // cudaDeviceSynchronize below is raised asynchronously by the kernel
    // itself. On Windows WDDM devices like this GeForce 840M, kernels
    // running longer than the TDR watchdog limit (~2 s by default) are
    // killed by the OS and surface exactly this way — plausible for the
    // larger sizes listed in the question; confirm via the TdrDelay
    // registry setting or by shrinking the per-launch work.
    multiply_tiled_on_device_shared<<<dimGrid, dimBlock>>>(device_A, device_B, device_C_shared, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);
    CHECK_CUDA_ERROR(cudaPeekAtLastError());
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());

    // Warmup non-shared kernel.
    multiply_tiled_on_device_non_shared<<<dimGrid, dimBlock>>>(device_A, device_B, device_C_non_shared, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);
    CHECK_CUDA_ERROR(cudaPeekAtLastError());
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());

    // Start device timer for shared memory execution.
    timer_start(&timer_device_shared);

    // Execute tiled multiplication with shared memory on device.
    multiply_tiled_on_device_shared<<<dimGrid, dimBlock>>>(device_A, device_B, device_C_shared, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);
    CHECK_CUDA_ERROR(cudaPeekAtLastError());

    // Wait for the device to finish its tasks (timing brackets kernel only,
    // not the device-to-host copy).
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());

    // End device timer for shared memory.
    timer_end(&timer_device_shared);

    // Copy result matrix C to host.
    CHECK_CUDA_ERROR(cudaMemcpy(host_C_shared, device_C_shared, sizeof(float) * num_C_rows * num_C_columns, cudaMemcpyDeviceToHost));

    // Start device timer for non-shared memory execution.
    timer_start(&timer_device_non_shared);

    // Execute tiled multiplication with non-shared memory on device.
    multiply_tiled_on_device_non_shared<<<dimGrid, dimBlock>>>(device_A, device_B, device_C_non_shared, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);
    CHECK_CUDA_ERROR(cudaPeekAtLastError());

    // Wait for the device to finish its tasks.
    CHECK_CUDA_ERROR(cudaDeviceSynchronize());

    // End device timer for non-shared memory.
    timer_end(&timer_device_non_shared);

    // Copy result matrix C to host.
    CHECK_CUDA_ERROR(cudaMemcpy(host_C_non_shared, device_C_non_shared, sizeof(float) * num_C_rows * num_C_columns, cudaMemcpyDeviceToHost));

    // Start host timer.
    timer_start(&timer_host);

    // Execute multiplication on host (reference result).
    multiply_on_host(host_A, host_B, host_computed_C, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);

    // End host timer.
    timer_end(&timer_host);

    // Compare results of host and device (0 = shared kernel, 1 = non-shared).
    check_result(host_computed_C, host_C_shared,     num_C_rows, num_C_columns, 0);
    check_result(host_computed_C, host_C_non_shared, num_C_rows, num_C_columns, 1);
    printf("\n");

    // Perform statistical calculations (device time as a percentage).
    float perc_device_shared_to_host              = (float) timer_device_shared.elapsed_microseconds.QuadPart     / timer_host.elapsed_microseconds.QuadPart              * 100;
    float perc_device_non_shared_to_host          = (float) timer_device_non_shared.elapsed_microseconds.QuadPart / timer_host.elapsed_microseconds.QuadPart              * 100;
    float perc_device_shared_to_device_non_shared = (float) timer_device_shared.elapsed_microseconds.QuadPart     / timer_device_non_shared.elapsed_microseconds.QuadPart * 100;

    // Print sizes, dimensions, times, etc.
    printf("Matrix A:                                   %d x %d\n", num_A_rows, num_A_columns);
    printf("Matrix B:                                   %d x %d\n", num_B_rows, num_B_columns);
    printf("Matrix C:                                   %d x %d\n", num_C_rows, num_C_columns);
    printf("\n");
    printf("Tile:                                       %d\n", TILE_WIDTH);
    printf("\n");
    printf("Grid:                                       %d x %d x %d\n", dimGrid.x, dimGrid.y, dimGrid.z);
    printf("Block:                                      %d x %d x %d\n", dimBlock.x, dimBlock.y, dimBlock.z);
    printf("# Threads:                                  %d x %d x %d\n", dimGrid.x*dimBlock.x, dimGrid.y*dimBlock.y, dimGrid.z*dimBlock.z);
    printf("\n");
    printf("Host time:                                  %lld mu-s\n", timer_host.elapsed_microseconds.QuadPart);
    printf("Device (non-shared) time:                   %lld mu-s\n", timer_device_non_shared.elapsed_microseconds.QuadPart);
    printf("Device (shared) time:                       %lld mu-s\n", timer_device_shared.elapsed_microseconds.QuadPart);
    printf("\n");
    printf("Device (non-shared) vs host:                %f %%\n", perc_device_non_shared_to_host);
    printf("Device (shared)     vs host:                %f %%\n", perc_device_shared_to_host);
    printf("Device (shared)     vs device (non-shared): %f %%\n", perc_device_shared_to_device_non_shared);

    // Free memory on device.
    CHECK_CUDA_ERROR(cudaFree(device_A));
    CHECK_CUDA_ERROR(cudaFree(device_B));
    CHECK_CUDA_ERROR(cudaFree(device_C_shared));
    CHECK_CUDA_ERROR(cudaFree(device_C_non_shared));

    // Free memory on host.
    free(host_A);
    free(host_B);
    free(host_C_shared);
    free(host_C_non_shared);
    free(host_computed_C);

    // Return.
    CHECK_CUDA_ERROR(cudaDeviceReset());
    return 0;
}

/***** TILED & SHARED MULTIPLICATION ON DEVICE *****/

// Tiled matrix multiply C = A * B using shared memory: each block streams
// TILE_WIDTH x TILE_WIDTH tiles of A and B through shared memory and each
// thread accumulates one element of C. Out-of-range tile slots are padded
// with zeros so partial edge tiles contribute nothing extra.
// Note: num_A_rows and num_B_columns are accepted but unused here — all
// bounds checks use the C dimensions (and num_A_columns / num_B_rows).
__global__ void multiply_tiled_on_device_shared(float* A, float* B, float* C,
                                                int num_A_rows, int num_A_columns,
                                                int num_B_rows, int num_B_columns,
                                                int num_C_rows, int num_C_columns)
{
    // Allocate shared memory for tiles of matrices A and B.
    __shared__ float A_tile[TILE_WIDTH][TILE_WIDTH];
    __shared__ float B_tile[TILE_WIDTH][TILE_WIDTH];

    // Set thread's coordinates (row/column of the C element this thread owns).
    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;

    // Variable to store temporary results for current tiles.
    float C_temp = 0.0;

    // Iterate over each tile along the shared K dimension.
    // NOTE(review): the float ceil(...) bound is re-evaluated every
    // iteration; an integer ceil-division hoisted before the loop would be
    // cheaper and immune to float rounding for very large K — confirm.
    for(int i=0; i<ceil((float) num_A_columns/TILE_WIDTH); i++)
    {

        // Copy data of matrix A from global memory to shared memory.
        if(row < num_C_rows && i * TILE_WIDTH + threadIdx.x < num_A_columns)
        {
            A_tile[threadIdx.y][threadIdx.x] = A[row*num_A_columns + i*TILE_WIDTH + threadIdx.x];
        }
        else
        {
            // Zero-pad slots outside A so they add nothing to the dot product.
            A_tile[threadIdx.y][threadIdx.x] = 0.0;
        }

        // Copy data of matrix B from global memory to shared memory.
        // (num_C_columns == num_B_columns per the caller, so this indexes B's row stride.)
        if(col < num_C_columns && i * TILE_WIDTH + threadIdx.y < num_B_rows)
        {
            B_tile[threadIdx.y][threadIdx.x] = B[(i*TILE_WIDTH + threadIdx.y)*num_C_columns + col];
        }
        else
        {
            B_tile[threadIdx.y][threadIdx.x] = 0.0;
        }

        // Wait for all threads in block to finish loading the tiles.
        __syncthreads();

        // Perform matrix multiplication for current tile.
        // NOTE(review): the j<num_A_columns clause appears redundant — the
        // zero padding above already neutralizes out-of-range slots — confirm.
        for(int j=0; j<TILE_WIDTH && j<num_A_columns; j++)
        {
            C_temp += A_tile[threadIdx.y][j] * B_tile[j][threadIdx.x];
        }

         // Wait for all threads in block to finish before the tiles are overwritten.
        __syncthreads();
    }

    // Set result in matrix C in global memory (only threads inside C's bounds).
    if(row<num_C_rows && col<num_C_columns)
    {
        C[row*num_C_columns + col] = C_temp;
    }
}

/***** TILED & NON-SHARED MULTIPLICATION ON DEVICE *****/

// Tiled matrix multiply C = A * B reading A and B directly from global
// memory (no shared-memory staging); used as the baseline for comparing
// against the shared-memory kernel. Loop structure mirrors the shared
// kernel so the per-launch work is comparable.
// Note: num_A_rows and num_B_columns are accepted but unused here.
__global__ void multiply_tiled_on_device_non_shared(float* A, float* B, float* C,
                                                    int num_A_rows, int num_A_columns,
                                                    int num_B_rows, int num_B_columns,
                                                    int num_C_rows, int num_C_columns)
{

    // Set thread's coordinates (row/column of the C element this thread owns).
    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;

    // Variable to store temporary results for current tiles.
    float C_temp = 0.0;

    // Iterate over each tile along the shared K dimension.
    // NOTE(review): float ceil(...) bound recomputed per iteration — same
    // remark as the shared kernel; an integer ceil-division would be safer.
    for(int i=0; i<ceil((float) num_A_columns/TILE_WIDTH); i++)
    {

        // Perform matrix multiplication for current tile; every product is
        // guarded individually since there is no zero-padded staging here.
        for(int j=0; j<TILE_WIDTH && j<num_A_columns; j++)
        {
            if(row < num_C_rows &&
               i * TILE_WIDTH + j < num_A_columns &&
               col < num_C_columns &&
               i * TILE_WIDTH + j < num_B_rows)
                C_temp += A[row*num_A_columns + i*TILE_WIDTH + j] * B[(i*TILE_WIDTH + j)*num_C_columns + col];
        }
    }

    // Set result in matrix C in global memory (only threads inside C's bounds).
    if(row<num_C_rows && col<num_C_columns)
    {
        C[row*num_C_columns + col] = C_temp;
    }
}

/***** MULTIPLICATION ON HOST *****/

// CPU reference implementation of C = A * B using the classic triple loop:
// each element of C is the dot product of one row of A with one column of B.
// num_A_rows and num_B_rows are part of the shared signature but unused here.
void multiply_on_host(float* A, float* B, float* C,
                      int num_A_rows, int num_A_columns,
                      int num_B_rows, int num_B_columns,
                      int num_C_rows, int num_C_columns)
{
    for (int row = 0; row < num_C_rows; row++)
    {
        for (int col = 0; col < num_C_columns; col++)
        {
            // Accumulate the dot product locally, then store it once.
            float sum = 0.0f;
            for (int k = 0; k < num_A_columns; k++)
            {
                sum += A[row * num_A_columns + k] * B[k * num_B_columns + col];
            }
            C[row * num_C_columns + col] = sum;
        }
    }
}

/***** UTILS *****/

// Creates an array (matrix) with x*y float entries, each between "min" and "max".
// PRNG is not secure and is never seeded, so two independent executions of this
// programme produce the same set of random numbers (deterministic output).
void create_matrix(float* m, int x, int y)
{

    // Define minimal and maximal values.
    float min = 1.0f;
    float max = 9.9f;
    float range = max - min;

    // The scale factor is loop-invariant; compute it once instead of on
    // every inner iteration as the original did.
    float div = RAND_MAX / range;

    // Fill the given matrix.
    // NOTE(review): the index j*x + i walks the buffer column-major while the
    // rest of the file addresses matrices row-major. All x*y slots are still
    // covered exactly once, so this is harmless for a random fill — but
    // confirm it is intentional.
    for(int i=0; i<x; i++)
    {
        for(int j=0; j<y; j++)
        {
            m[j*x + i] = min + (rand() / div);
        }
    }
}

// Compares two x-by-y float arrays element-wise within an absolute tolerance
// and prints whether they match; mismatching elements are printed individually.
// device_type selects the label: 0 = shared-memory kernel, 1 = non-shared.
void check_result(float* host_computed_C, float* host_C, int x, int y, int device_type)
{

    // Set comparison precision (absolute tolerance).
    double epsilon = 1.0E-1;

    // Boolean toggled off if the result arrays do not match.
    bool ok = true;

    // Iterate over rows.
    for(int i=0; i<x; i++)
    {

        // Iterate over columns.
        for(int j=0; j<y; j++)
        {
            // BUGFIX: compare the ABSOLUTE difference. The original tested
            // only (host - device) > epsilon, silently accepting any element
            // where the device value exceeded the host value.
            double diff = (double) *(host_computed_C + i*y + j) - (double) *(host_C + i*y + j);
            if( diff > epsilon || diff < -epsilon )
            {
                printf("HOST[%d][%d] = %f != %f = DEVICE[%d][%d]\n", i, j, *(host_computed_C + i*y + j), *(host_C + i*y + j), i, j);
                ok = false;
            }
        }
    }
    if(device_type == 0)
    {
        printf("Shared memory:                              ");
    }
    else
    {
        printf("Non-shared memory:                          ");
    }
    if(ok)
    {
        printf("Result arrays ok.\n");
    }
    else
    {
        printf("Result arrays do NOT match!\n");
    }
}

// Start a timer: record the performance-counter frequency and the starting
// tick count. The counter is read last so the frequency query's own cost is
// excluded from the measured interval.
void timer_start(Timer* timer)
{
    QueryPerformanceFrequency(&(timer->frequency));
    QueryPerformanceCounter(&(timer->start));
}

// End a timer: capture the current tick count and convert the elapsed ticks
// to microseconds in elapsed_microseconds.QuadPart.
void timer_end(Timer* timer)
{
    QueryPerformanceCounter(&(timer->end));
    // Elapsed ticks since timer_start.
    timer->elapsed_microseconds.QuadPart = timer->end.QuadPart - timer->start.QuadPart;
    // Multiply before dividing by the tick frequency so sub-second precision
    // is not lost to integer division (tick counts here are far from
    // overflowing a 64-bit QuadPart).
    timer->elapsed_microseconds.QuadPart *= 1000000;
    timer->elapsed_microseconds.QuadPart /= timer->frequency.QuadPart;
}

0 个答案:

没有答案