I am new to CUDA programming, and I wanted to try implementing matrix multiplication using parallel reduction. I came up with the code below and would like some clarification.
For reference, I am running it on an NVIDIA GeForce GTX 675M, which has compute capability 2.1.
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#define BLOCK_OPT 1024
typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;
__global__ void MatMulOPT(Matrix A, Matrix B, Matrix C)
{
    // One block per output element: thread e computes one partial
    // product of the dot product for C[row][col], then the block
    // reduces the partial products in shared memory.
    __shared__ float result[BLOCK_OPT];
    int e = threadIdx.x;
    int row = blockIdx.x;
    int col = blockIdx.y;
    result[e] = A.elements[row * A.stride + e] * B.elements[e * B.stride + col];
    __syncthreads();
    if (e < 512)
    {
        result[e] += result[e + 512];
    }
    __syncthreads();
    if (e < 256)
    {
        result[e] += result[e + 256];
    }
    __syncthreads();
    if (e < 128)
    {
        result[e] += result[e + 128];
    }
    __syncthreads();
    if (e < 64)
    {
        result[e] += result[e + 64];
    }
    __syncthreads();
    if (e < 32)
    {
        result[e] += result[e + 32];
        result[e] += result[e + 16];
        result[e] += result[e + 8];
        result[e] += result[e + 4];
        result[e] += result[e + 2];
        result[e] += result[e + 1];
    }
    if (e == 0)
        C.elements[row * C.stride + col] = result[0];
}
void MatMulCPU(Matrix A, Matrix B, Matrix C)
{
    for (int i = 0; i < A.height; i++)
    {
        for (int j = 0; j < B.width; j++)
        {
            for (int k = 0; k < B.height; k++)
            {
                C.elements[i * C.stride + j] += A.elements[i * A.stride + k] * B.elements[k * B.stride + j];
            }
        }
    }
}
float randomFloat()
{
    return (float)rand() / (float)RAND_MAX;
}
int main()
{
    clock_t begin, end;
    srand(time(NULL));

    // Common setup
    float cpu_t = 0, gpu_t = 0;
    int x = 1024;
    int N = x * x;
    size_t size = N * sizeof(float);

    Matrix A;
    A.width = x;
    A.stride = x;
    A.height = x;
    A.elements = (float*)malloc(size);
    for (int i = 0; i < N; i++)
        A.elements[i] = randomFloat();

    Matrix B;
    B.width = x;
    B.stride = x;
    B.height = x;
    B.elements = (float*)malloc(size);
    for (int j = 0; j < N; j++)
        B.elements[j] = randomFloat();

    Matrix C;
    C.width = x;
    C.stride = x;
    C.height = x;
    C.elements = (float*)malloc(size);
    for (int k = 0; k < N; k++)
        C.elements[k] = 0;

    Matrix D;
    D.width = x;
    D.stride = x;
    D.height = x;
    D.elements = (float*)malloc(size);
    for (int l = 0; l < N; l++)
        D.elements[l] = 0;

    // Execute CPU code & time it
    begin = clock();
    MatMulCPU(A, B, D);
    end = clock();
    cpu_t = (float)(end - begin);

    // GPU setup
    Matrix d_A, d_B, d_C;
    d_A.width = x;
    d_A.stride = x;
    d_A.height = x;
    d_B.width = x;
    d_B.stride = x;
    d_B.height = x;
    d_C.width = x;
    d_C.stride = x;
    d_C.height = x;
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_C.elements, size);
    cudaMemcpy(d_C.elements, C.elements, size, cudaMemcpyHostToDevice);

    // Launch parameters: one block per output element, 1024 threads per block
    int optBlockSize = BLOCK_OPT;
    dim3 optDimGrid(x, x);

    // Execute GPU kernel and time it
    begin = clock();
    cudaThreadSynchronize();
    MatMulOPT<<<optDimGrid, optBlockSize>>>(d_A, d_B, d_C);
    cudaThreadSynchronize();
    end = clock();
    gpu_t = (float)(end - begin);
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

    // Compare the GPU result against the CPU reference
    bool passed = true;
    for (int k = 0; k < N; k++)
    {
        //printf("%f ", C.elements[k]);
        if (fabs(C.elements[k] - D.elements[k]) > 1e-6)
        {
            passed = false;
            printf("\nFAIL\n");
            printf("C[%d] = %f, D[%d] = %f\n", k, C.elements[k], k, D.elements[k]);
            break;
        }
    }
    printf("\n");
    if (passed)
        printf("PASSED\n");
    printf("CPU Elapsed Time: %f\n", cpu_t);
    printf("GPU Elapsed Time: %f\n", gpu_t);

    // Free all GPU memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);

    // Free all CPU memory
    free(A.elements);
    free(B.elements);
    free(C.elements);
}
Answer 0 (score: 3)
1. Why does the code return incorrect results?
In the last stage of the reduction (e < 32) you break your scheme. This leads to race conditions on the same result elements. For example, in

result[e] += result[e + 16];

e == 0 means "read result[16]", while in the same step, e == 16 means "write result[16]".
To avoid the race condition, you have two options:

1. Declare the shared memory volatile (see the linked reference, page 78) and continue with if (e < ...) as before; a sketch of this follows after the list.
2. Or convert all of the ifs into a loop:
for (int size = blockDim.x / 2; size > 0; size /= 2)
{
    if (e < size)
        result[e] += result[e + size];
    __syncthreads();
}
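
For the first option, here is a minimal sketch of what the unrolled last-warp part of your kernel could look like. This is my assumption of the intended fix, following the classic warp-synchronous pattern for Fermi-class hardware such as your GTX 675M; on newer architectures you would use __syncwarp() or warp shuffles instead.

if (e < 32)
{
    // volatile stops the compiler from caching shared-memory values
    // in registers, so each step actually reads its neighbours' writes.
    volatile float* vresult = result;
    vresult[e] += vresult[e + 32];
    vresult[e] += vresult[e + 16];
    vresult[e] += vresult[e + 8];
    vresult[e] += vresult[e + 4];
    vresult[e] += vresult[e + 2];
    vresult[e] += vresult[e + 1];
}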
2. Why is the runtime much longer than that of the shared-memory approach described on page 25, Chapter 3 of the CUDA C Programming Guide?
Access to shared memory is much faster than access to global memory. You store your intermediate results in shared memory, whereas the example you refer to stores the parts of the matrices that are about to be read. There this is combined with loop tiling: each thread loads only one element of the whole tile from global memory, but later reads TILE_WIDTH * 2 elements from shared memory.
The example's higher performance comes directly from the reduced time spent waiting for data to be loaded from global memory.
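
To make that concrete, here is a minimal sketch of such a tiled kernel in the style of the Programming Guide example, reusing the Matrix struct from your code. The kernel name, the TILE_WIDTH value, and the launch configuration are my assumptions, and it assumes square matrices whose dimensions are multiples of TILE_WIDTH:

#define TILE_WIDTH 16

__global__ void MatMulTiled(Matrix A, Matrix B, Matrix C)
{
    // Tiles of A and B staged in shared memory.
    __shared__ float As[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bs[TILE_WIDTH][TILE_WIDTH];

    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    float sum = 0.0f;

    for (int t = 0; t < A.width / TILE_WIDTH; ++t)
    {
        // Each thread loads exactly one element of each tile from
        // global memory ...
        As[threadIdx.y][threadIdx.x] = A.elements[row * A.stride + t * TILE_WIDTH + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B.elements[(t * TILE_WIDTH + threadIdx.y) * B.stride + col];
        __syncthreads();

        // ... but then reads 2 * TILE_WIDTH elements from fast shared memory.
        for (int k = 0; k < TILE_WIDTH; ++k)
            sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();
    }
    C.elements[row * C.stride + col] = sum;
}

It would be launched with something like dim3 threads(TILE_WIDTH, TILE_WIDTH); dim3 grid(x / TILE_WIDTH, x / TILE_WIDTH);, so each block computes a whole TILE_WIDTH x TILE_WIDTH tile of C rather than a single element.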