我正在使用共享内存在CUDA-C中实现平铺矩阵乘法算法,用于任意大小的矩阵。
对于某些矩阵大小,我会收到CUDA错误,而对于其他大小,一切都可以正常工作。
平铺宽度(tile width)本应是可调整的,但在开发过程中固定为32。
例如,这些矩阵大小会导致 CUDA错误:
1000 x 100 * 100 x 1000 = 1000 x 1000
1000 x 1000 * 1000 x 1000 = 1000 x 1000
10000 x 1 * 1 x 10000 = 10000 x 10000
1 x 100000 * 100000 x 1 = 1 x 1
999 x 999 * 999 x 999 = 999 x 999
512 x 512 * 512 x 512 = 512 x 512
1024 x 1024 * 1024 x 1024 = 1024 x 1024
例如,这些矩阵大小正常工作:
100 x 100 * 100 x 100 = 100 x 100
200 x 100 * 100 x 200 = 200 x 200
100 x 1000 * 1000 x 100 = 100 x 100
1000 x 10 * 10 x 1000 = 1000 x 1000
1000 x 1 * 1 x 1000 = 1000 x 1000
1 x 10000 * 10000 x 1 = 1 x 1
99 x 99 * 99 x 99 = 99 x 99
32 x 32 * 32 x 32 = 32 x 32
64 x 64 * 64 x 64 = 64 x 64
128 x 128 * 128 x 128 = 128 x 128
256 x 256 * 256 x 256 = 256 x 256
我在一台装有 GeForce 840M 显卡的 Windows 10 计算机上使用 MinGW64。
我使用以下命令编译/启动了代码,其中N,K和M是矩阵大小。
nvcc -lineinfo -g -arch=sm_50 tiled_matrix_mult.cu
cuda-memcheck a.exe N K M
CUDA错误消息的结构始终相同。例如,当矩阵大小为 N = K = M = 1000 时,会显示以下错误消息:
C:\Users\my\path>cuda-memcheck a.exe 1000 1000 1000
========= CUDA-MEMCHECK
Cuda error in line 168: unspecified launch failure
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuD3D9UnmapVertexBuffer + 0x2e1ea0) [0x2f01db]
========= Host Frame:C:\Users\my\path\a.exe (cudaDeviceSynchronize + 0xf8) [0x3ad8]
========= Host Frame:C:\Users\my\path\a.exe (main + 0x60c) [0x49aec]
========= Host Frame:C:\Users\my\path\a.exe (__scrt_common_main_seh + 0x10c) [0x4bf78]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x13034]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x73691]
=========
========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuD3D9UnmapVertexBuffer + 0x1c03f3) [0x1ce72e]
========= Host Frame:C:\Users\my\path\a.exe (cudart::module::unload + 0x115) [0x38725]
========= Host Frame:C:\Users\my\path\a.exe (cudart::contextState::unloadAllModules + 0x1c6) [0x38d56]
========= Host Frame:C:\Users\my\path\a.exe (cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload + 0x78) [0x3cd98]
========= Host Frame:C:\Users\my\path\a.exe (cudart::globalState::~globalState + 0x3f) [0x2858f]
========= Host Frame:C:\Users\my\path\a.exe (cudart::set<cudart::globalModule * __ptr64>::rehash + 0x106) [0x2f8f6]
========= Host Frame:C:\Users\my\path\a.exe (<lambda_f03950bc5685219e0bcd2087efbe011e>::operator() + 0xa4) [0x8494c]
========= Host Frame:C:\Users\my\path\a.exe (__crt_seh_guarded_call<int>::operator()<<lambda_7777bce6b2f8c936911f934f8298dc43>,<lambda_f03950bc5685219e0bcd2087efbe011e> & __ptr64,<lambda_3883c3dff614d5e0c5f61bb1ac94921c> > + 0x25) [0x845a5]
========= Host Frame:C:\Users\my\path\a.exe (_execute_onexit_table + 0x35) [0x84a6d]
========= Host Frame:C:\Users\my\path\a.exe (<lambda_6e4b09c48022b2350581041d5f6b0c4c>::operator() + 0x84) [0x64bec]
========= Host Frame:C:\Users\my\path\a.exe (__crt_seh_guarded_call<void>::operator()<<lambda_d80eeec6fff315bfe5c115232f3240e3>,<lambda_6e4b09c48022b2350581041d5f6b0c4c> & __ptr64,<lambda_2358e3775559c9db80273638284d5e45> > + 0x25) [0x64a95]
========= Host Frame:C:\Users\my\path\a.exe (common_exit + 0xa3) [0x64d3b]
========= Host Frame:C:\Users\my\path\a.exe (__scrt_common_main_seh + 0x173) [0x4bfdf]
========= Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x13034]
========= Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x73691]
=========
========= ERROR SUMMARY: 2 errors
我不知道是什么导致了这些错误。
但是ArrayOutOfBounds和OutOfMemory异常不是原因。这些会导致不同的错误消息(我之前不得不处理)。
您是否知道如何解决这个“未指定的启动失败”(unspecified launch failure)错误?
这是源代码:
/***** IMPORTS *****/
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <windows.h>
/***** STRUCT DEFINITIONS *****/
struct Timer
{
LARGE_INTEGER start;
LARGE_INTEGER end;
LARGE_INTEGER elapsed_microseconds;
LARGE_INTEGER frequency;
};
/***** DECLERATIONS *****/
// Multiplication.
__global__ void multiply_tiled_on_device_shared(float* A, float* B, float* C, int num_A_rows, int num_A_columns, int num_B_rows, int num_B_columns, int num_C_rows, int num_C_columns);
__global__ void multiply_tiled_on_device_non_shared(float* A, float* B, float* C, int num_A_rows, int num_A_columns, int num_B_rows, int num_B_columns, int num_C_rows, int num_C_columns);
void multiply_on_host(float* A, float* B, float* C, int num_A_rows, int num_A_columns, int num_B_rows, int num_B_columns, int num_C_rows, int num_C_columns);
// Utils for matrices
void create_matrix(float* m, int x, int y);
void check_result(float* host_computed_C, float* host_C, int x, int y, int device_type);
// Utils for timers.
void timer_start(Timer* timer);
void timer_end(Timer* timer);
/***** STATIC VARIABLES *****/
// Default matrix boundaries.
const int INPUT_MATRIX_A_SIZE_X = 3000;
const int INPUT_MATRIX_A_SIZE_Y = 1000;
const int INPUT_MATRIX_B_SIZE_X = 1000;
const int INPUT_MATRIX_B_SIZE_Y = 5000;
// Tile boundaries.
const int TILE_WIDTH = 32;
/***** MACROS *****/
#define CHECK_CUDA_ERROR(result)\
do\
{\
cudaError_t error = result;\
if (error != cudaSuccess)\
{\
printf("Cuda error in line %d: ", __LINE__);\
printf("%s\n", cudaGetErrorString(error));\
return -1;\
}\
}\
while(0)
/***** MAIN *****/
int main(int argc, char ** argv)
{
printf("\n");
// Check command line arguments.
if(argc != 1 && argc != 4){
printf("Please provide 0 or 3 arguments.\n");
printf("Arguments: M K N\n");
printf(" - Matrix A of size M x K\n");
printf(" - Matrix B of size K x N\n");
printf(" - Matrix C=A*B of size M x N\n");
exit(-1);
}
// Initialise matrix sizes.
int num_A_rows;
int num_A_columns;
int num_B_rows;
int num_B_columns;
int num_C_rows;
int num_C_columns;
// Set matrix sizes.
if(argc == 4)
{
num_A_rows = atoi(*(argv+1));
num_A_columns = atoi(*(argv+2));
num_B_rows = atoi(*(argv+2));
num_B_columns = atoi(*(argv+3));
num_C_rows = num_A_rows;
num_C_columns = num_B_columns;
}
else
{
num_A_rows = INPUT_MATRIX_A_SIZE_X;
num_A_columns = INPUT_MATRIX_A_SIZE_Y;
num_B_rows = INPUT_MATRIX_B_SIZE_X;
num_B_columns = INPUT_MATRIX_B_SIZE_Y;
num_C_rows = num_A_rows;
num_C_columns = num_B_columns;
}
// Allocate memory for host.
float* host_A = (float*) malloc(sizeof(float) * num_A_rows * num_A_columns);
float* host_B = (float*) malloc(sizeof(float) * num_B_rows * num_B_columns);
float* host_C_shared = (float*) malloc(sizeof(float) * num_C_rows * num_C_columns);
float* host_C_non_shared = (float*) malloc(sizeof(float) * num_C_rows * num_C_columns);
float* host_computed_C = (float*) malloc(sizeof(float) * num_C_rows * num_C_columns);
// Allocate memory for device.
float* device_A;
float* device_B;
float* device_C_shared;
float* device_C_non_shared;
CHECK_CUDA_ERROR(cudaMalloc((void**)&device_A, sizeof(float) * num_A_rows * num_A_columns));
CHECK_CUDA_ERROR(cudaMalloc((void**)&device_B, sizeof(float) * num_B_rows * num_B_columns));
CHECK_CUDA_ERROR(cudaMalloc((void**)&device_C_shared, sizeof(float) * num_C_rows * num_C_columns));
CHECK_CUDA_ERROR(cudaMalloc((void**)&device_C_non_shared, sizeof(float) * num_C_rows * num_C_columns));
// Create matrices A and B on host.
create_matrix(host_A, num_A_rows, num_A_columns);
create_matrix(host_B, num_B_rows, num_B_columns);
// Copy matrices A and B to device.
CHECK_CUDA_ERROR(cudaMemcpy(device_A, host_A, sizeof(float) * num_A_rows * num_A_columns, cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaMemcpy(device_B, host_B, sizeof(float) * num_B_rows * num_B_columns, cudaMemcpyHostToDevice));
// Set dimensions for device.
dim3 dimGrid((num_C_columns + TILE_WIDTH - 1) / TILE_WIDTH, (num_C_rows + TILE_WIDTH - 1) / TILE_WIDTH, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
// Initialise timer.
Timer timer_host;
Timer timer_device_shared;
Timer timer_device_non_shared;
// Warmup shared kernel.
multiply_tiled_on_device_shared<<<dimGrid, dimBlock>>>(device_A, device_B, device_C_shared, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);
CHECK_CUDA_ERROR(cudaPeekAtLastError());
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
// Warmup non-shared kernel.
multiply_tiled_on_device_non_shared<<<dimGrid, dimBlock>>>(device_A, device_B, device_C_non_shared, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);
CHECK_CUDA_ERROR(cudaPeekAtLastError());
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
// Start device timer for shared memory execution.
timer_start(&timer_device_shared);
// Execute tiled multiplication with shared memory on device.
multiply_tiled_on_device_shared<<<dimGrid, dimBlock>>>(device_A, device_B, device_C_shared, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);
CHECK_CUDA_ERROR(cudaPeekAtLastError());
// Wait for the device to finish its tasks.
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
// End device timer for shared memory.
timer_end(&timer_device_shared);
// Copy result matrix C to host.
CHECK_CUDA_ERROR(cudaMemcpy(host_C_shared, device_C_shared, sizeof(float) * num_C_rows * num_C_columns, cudaMemcpyDeviceToHost));
// Start device timer for non-shared memory execution.
timer_start(&timer_device_non_shared);
// Execute tiled multiplication with non-shared memory on device.
multiply_tiled_on_device_non_shared<<<dimGrid, dimBlock>>>(device_A, device_B, device_C_non_shared, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);
CHECK_CUDA_ERROR(cudaPeekAtLastError());
// Wait for the device to finish its tasks.
CHECK_CUDA_ERROR(cudaDeviceSynchronize());
// End device timer for non-shared memory.
timer_end(&timer_device_non_shared);
// Copy result matrix C to host.
CHECK_CUDA_ERROR(cudaMemcpy(host_C_non_shared, device_C_non_shared, sizeof(float) * num_C_rows * num_C_columns, cudaMemcpyDeviceToHost));
// Start host timer.
timer_start(&timer_host);
// Execute multiplication on host.
multiply_on_host(host_A, host_B, host_computed_C, num_A_rows, num_A_columns, num_B_rows, num_B_columns, num_C_rows, num_C_columns);
// End host timer.
timer_end(&timer_host);
// Compare results of host and device.
check_result(host_computed_C, host_C_shared, num_C_rows, num_C_columns, 0);
check_result(host_computed_C, host_C_non_shared, num_C_rows, num_C_columns, 1);
printf("\n");
// Perform statistical calculations.
float perc_device_shared_to_host = (float) timer_device_shared.elapsed_microseconds.QuadPart / timer_host.elapsed_microseconds.QuadPart * 100;
float perc_device_non_shared_to_host = (float) timer_device_non_shared.elapsed_microseconds.QuadPart / timer_host.elapsed_microseconds.QuadPart * 100;
float perc_device_shared_to_device_non_shared = (float) timer_device_shared.elapsed_microseconds.QuadPart / timer_device_non_shared.elapsed_microseconds.QuadPart * 100;
// Print sizes, dimensions, times, etc.
printf("Matrix A: %d x %d\n", num_A_rows, num_A_columns);
printf("Matrix B: %d x %d\n", num_B_rows, num_B_columns);
printf("Matrix C: %d x %d\n", num_C_rows, num_C_columns);
printf("\n");
printf("Tile: %d\n", TILE_WIDTH);
printf("\n");
printf("Grid: %d x %d x %d\n", dimGrid.x, dimGrid.y, dimGrid.z);
printf("Block: %d x %d x %d\n", dimBlock.x, dimBlock.y, dimBlock.z);
printf("# Threads: %d x %d x %d\n", dimGrid.x*dimBlock.x, dimGrid.y*dimBlock.y, dimGrid.z*dimBlock.z);
printf("\n");
printf("Host time: %lld mu-s\n", timer_host.elapsed_microseconds.QuadPart);
printf("Device (non-shared) time: %lld mu-s\n", timer_device_non_shared.elapsed_microseconds.QuadPart);
printf("Device (shared) time: %lld mu-s\n", timer_device_shared.elapsed_microseconds.QuadPart);
printf("\n");
printf("Device (non-shared) vs host: %f %%\n", perc_device_non_shared_to_host);
printf("Device (shared) vs host: %f %%\n", perc_device_shared_to_host);
printf("Device (shared) vs device (non-shared): %f %%\n", perc_device_shared_to_device_non_shared);
// Free memory on device.
CHECK_CUDA_ERROR(cudaFree(device_A));
CHECK_CUDA_ERROR(cudaFree(device_B));
CHECK_CUDA_ERROR(cudaFree(device_C_shared));
CHECK_CUDA_ERROR(cudaFree(device_C_non_shared));
// Free memory on host.
free(host_A);
free(host_B);
free(host_C_shared);
free(host_C_non_shared);
free(host_computed_C);
// Return.
CHECK_CUDA_ERROR(cudaDeviceReset());
return 0;
}
/***** TILED & SHARED MULTIPLICATION ON DEVICE *****/
__global__ void multiply_tiled_on_device_shared(float* A, float* B, float* C,
int num_A_rows, int num_A_columns,
int num_B_rows, int num_B_columns,
int num_C_rows, int num_C_columns)
{
// Allocate shared memory for tiles of matrices A and B.
__shared__ float A_tile[TILE_WIDTH][TILE_WIDTH];
__shared__ float B_tile[TILE_WIDTH][TILE_WIDTH];
// Set thread's coordinates.
int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
// Variable to store temporary results for current tiles.
float C_temp = 0.0;
// Iterate over each tile.
for(int i=0; i<ceil((float) num_A_columns/TILE_WIDTH); i++)
{
// Copy data of matrix A from global memory to shared memory.
if(row < num_C_rows && i * TILE_WIDTH + threadIdx.x < num_A_columns)
{
A_tile[threadIdx.y][threadIdx.x] = A[row*num_A_columns + i*TILE_WIDTH + threadIdx.x];
}
else
{
A_tile[threadIdx.y][threadIdx.x] = 0.0;
}
// Copy data of matrix B from global memory to shared memory.
if(col < num_C_columns && i * TILE_WIDTH + threadIdx.y < num_B_rows)
{
B_tile[threadIdx.y][threadIdx.x] = B[(i*TILE_WIDTH + threadIdx.y)*num_C_columns + col];
}
else
{
B_tile[threadIdx.y][threadIdx.x] = 0.0;
}
// Wait for all threads in block to finish.
__syncthreads();
// Perform matrix multiplication for current tile.
for(int j=0; j<TILE_WIDTH && j<num_A_columns; j++)
{
C_temp += A_tile[threadIdx.y][j] * B_tile[j][threadIdx.x];
}
// Wait for all threads in block to finish.
__syncthreads();
}
// Set result in matrix C in global memory.
if(row<num_C_rows && col<num_C_columns)
{
C[row*num_C_columns + col] = C_temp;
}
}
/***** TILED & NON-SHARED MULTIPLICATION ON DEVICE *****/
__global__ void multiply_tiled_on_device_non_shared(float* A, float* B, float* C,
int num_A_rows, int num_A_columns,
int num_B_rows, int num_B_columns,
int num_C_rows, int num_C_columns)
{
// Set thread's coordinates.
int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
// Variable to store temporary results for current tiles.
float C_temp = 0.0;
// Iterate over each tile.
for(int i=0; i<ceil((float) num_A_columns/TILE_WIDTH); i++)
{
// Perform matrix multiplication for current tile.
for(int j=0; j<TILE_WIDTH && j<num_A_columns; j++)
{
if(row < num_C_rows &&
i * TILE_WIDTH + j < num_A_columns &&
col < num_C_columns &&
i * TILE_WIDTH + j < num_B_rows)
C_temp += A[row*num_A_columns + i*TILE_WIDTH + j] * B[(i*TILE_WIDTH + j)*num_C_columns + col];
}
}
// Set result in matrix C in global memory.
if(row<num_C_rows && col<num_C_columns)
{
C[row*num_C_columns + col] = C_temp;
}
}
/***** MULTIPLICATION ON HOST *****/
void multiply_on_host(float* A, float* B, float* C,
int num_A_rows, int num_A_columns,
int num_B_rows, int num_B_columns,
int num_C_rows, int num_C_columns)
{
// Iterate over rows.
for(int i=0; i<num_C_rows; i++)
{
// Iterate over columns.
for(int j=0; j<num_C_columns; j++)
{
// Compute one element in result array.
C[i*num_C_columns + j] = 0.0;
for(int k=0; k<num_A_columns; k++)
{
C[i*num_C_columns + j] += A[i*num_A_columns + k] * B[k*num_B_columns + j];
}
}
}
}
/***** UTILS *****/
// Creates an array (matrix) with x*y float entries with each entry between "min" and "max".
// PRNG is not secure. Indeed, two independent executions of this programme produce the same set of random numbers, because the PRNG's salt is static.
void create_matrix(float* m, int x, int y)
{
// Define minimal and maximal values.
float min = 1.0f;
float max = 9.9f;
float range = max - min;
// Fill the given matrix.
float r;
float div;
for(int i=0; i<x; i++)
{
for(int j=0; j<y; j++)
{
div = RAND_MAX / range;
r = min + (rand() / div);
*(m + j*x + i) = r;
}
}
}
// Compares two float arrays.
void check_result(float* host_computed_C, float* host_C, int x, int y, int device_type)
{
// Set comparison precision.
double epsilon = 1.0E-1;
// Boolean to toggle if result arrays to not match
bool ok = true;
// Iterate over rows.
for(int i=0; i<x; i++)
{
// Iterate over columns.
for(int j=0; j<y; j++)
{
if( (*(host_computed_C + i*y + j) - *(host_C + i*y + j)) > epsilon )
{
printf("HOST[%d][%d] = %f != %f = DEVICE[%d][%d]\n", i, j, *(host_computed_C + i*y + j), *(host_C + i*y + j), i, j);
ok = false;
}
}
}
if(device_type == 0)
{
printf("Shared memory: ");
}
else
{
printf("Non-shared memory: ");
}
if(ok)
{
printf("Result arrays ok.\n");
}
else
{
printf("Result arrays do NOT match!\n");
}
}
// Start a timer.
void timer_start(Timer* timer)
{
QueryPerformanceFrequency(&(timer->frequency));
QueryPerformanceCounter(&(timer->start));
}
// End a timer.
void timer_end(Timer* timer)
{
QueryPerformanceCounter(&(timer->end));
timer->elapsed_microseconds.QuadPart = timer->end.QuadPart - timer->start.QuadPart;
timer->elapsed_microseconds.QuadPart *= 1000000;
timer->elapsed_microseconds.QuadPart /= timer->frequency.QuadPart;
}