// Includes
#include <stdio.h>
#include <stdlib.h>   // malloc, free, rand
#include <string.h>   // strcmp
#include <math.h>     // fabs
#include <time.h>
#include <cutil_inline.h>
#include <shrQATest.h>
#ifndef CLOCKS_PER_SEC
#define CLOCKS_PER_SEC ((clock_t)1000)
#endif
// Variables
float* h_A;
float* h_B;
float* h_C;
float* h_C_cpu;
float* d_A;
float* d_B;
float* d_C;
bool noprompt = false;
// Functions
void CleanupResources(void);
void RandomInit(float*, int);
void ParseArguments(int, char**);
void ZeroInit(float*, int);
// Device code
__global__ void MatrixMul(const float* A, const float* B, float* C, int Arow, int Acol, int Bcol)
{
    int coli = blockDim.x * blockIdx.x + threadIdx.x;
    int rowi = blockDim.y * blockIdx.y + threadIdx.y;
    float tmp = 0;
    C[rowi*Bcol+coli] = 0;
    for (int k = 0; k < Acol; k++)
    {
        if (rowi < Arow && coli < Bcol)
            C[rowi*Bcol+coli] += A[rowi*Acol+k]*B[k*Bcol+coli];
    }
    //__syncthreads();
    //C[rowi*Bcol+coli] = tmp;
}
// Host code
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);
    clock_t start, end;
    double duration;
    printf("Matrix Multiplication\n");
    int a_row = 800, a_col = 600, b_row = 600, b_col = 900;
    int a_size = a_row*a_col*sizeof(float);
    int b_size = b_row*b_col*sizeof(float);
    int c_size = a_row*b_col*sizeof(float);
    //const int matrixrow=10000,matrixcol=10000;
    h_A = (float*)malloc(a_size);
    h_B = (float*)malloc(b_size);
    h_C = (float*)malloc(c_size);
    h_C_cpu = (float*)malloc(c_size);
    RandomInit(h_A, a_size/sizeof(float));
    RandomInit(h_B, b_size/sizeof(float));
    //memset(h_C,0,c_size);
    ZeroInit(h_C, c_size/sizeof(float));
    //memset(h_C_cpu,0,c_size);
    ZeroInit(h_C_cpu, c_size/sizeof(float));
    //RandomInit(h_C, c_size);
    start = clock();
    int i, j, k;
    for (i = 0; i < a_row; i++)
    {
        for (j = 0; j < b_col; j++)
        {
            for (k = 0; k < a_col; k++)
            {
                h_C_cpu[i*b_col+j] += h_A[i*a_col+k]*h_B[k*b_col+j];
            }
        }
    }
    end = clock();
    duration = double(end - start)/CLOCKS_PER_SEC;
    printf("CPU time: %lf\n", duration);
    cutilSafeCall( cudaMalloc((void**)&d_A, a_size) );
    cutilSafeCall( cudaMalloc((void**)&d_B, b_size) );
    cutilSafeCall( cudaMalloc((void**)&d_C, c_size) );
    ParseArguments(argc, argv);
    // Allocate input vectors h_A and h_B in host memory
    /*h_A = (float*)malloc(size);
    if (h_A == 0) CleanupResources();
    h_B = (float*)malloc(size);
    if (h_B == 0) CleanupResources();
    h_C = (float*)malloc(size);
    if (h_C == 0) CleanupResources();*/
    // Initialize input vectors
    // Allocate vectors in device memory
    /*cutilSafeCall( cudaMalloc((void**)&d_A, size) );
    cutilSafeCall( cudaMalloc((void**)&d_B, size) );
    cutilSafeCall( cudaMalloc((void**)&d_C, size) );*/
    start = clock();
    // Copy vectors from host memory to device memory
    cutilSafeCall( cudaMemcpy(d_A, h_A, a_size, cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMemcpy(d_B, h_B, b_size, cudaMemcpyHostToDevice) );
    // Invoke kernel
    //int threadsPerBlock = 1024;
    dim3 dimblock(32, 32);
    int blockx = (b_col + dimblock.x - 1) / dimblock.x;
    int blocky = (a_row + dimblock.y - 1) / dimblock.y;
    dim3 dimgrid(blockx, blocky);
    MatrixMul<<<dimgrid, dimblock>>>(d_A, d_B, d_C, a_row, a_col, b_col);
    //myVecAdd<<<1,threadsPerBlock>>>(d_A,d_B,d_C,N);
    cutilCheckMsg("kernel launch failure");
#ifdef _DEBUG
    cutilSafeCall( cutilDeviceSynchronize() );
#endif
    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    cutilSafeCall( cudaMemcpy(h_C, d_C, c_size, cudaMemcpyDeviceToHost) );
    end = clock();
    duration = double(end - start)/CLOCKS_PER_SEC;
    printf("GPU time: %lf\n", duration);
    // Verify result
    for (i = 0; i < a_row*b_col; ++i) {
        //float sum = h_A[i] + h_B[i];
        if (fabs(h_C[i] - h_C_cpu[i]) > 1e-5)
        {
            //printf("The result is wrong!\n");
            break;
        }
    }
    CleanupResources();
    shrQAFinishExit(argc, (const char **)argv, (i == a_row*b_col) ? QA_PASSED : QA_FAILED);
}
void ZeroInit(float* a, int N)
{
    for (int i = 0; i < N; i++)
        a[i] = 0;
}
void CleanupResources(void)
{
    // Free device memory
    if (d_A)
        cudaFree(d_A);
    if (d_B)
        cudaFree(d_B);
    if (d_C)
        cudaFree(d_C);
    // Free host memory
    if (h_A)
        free(h_A);
    if (h_B)
        free(h_B);
    if (h_C)
        free(h_C);
    if (h_C_cpu)
        free(h_C_cpu);
    cutilDeviceReset();
}
// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = rand() / (float)RAND_MAX;
}
// Parse program arguments
void ParseArguments(int argc, char** argv)
{
    for (int i = 0; i < argc; ++i) {
        if (strcmp(argv[i], "--noprompt") == 0 ||
            strcmp(argv[i], "-noprompt") == 0)
        {
            noprompt = true;
            break;
        }
    }
}
The above is my CUDA code, "MatrixMul.cu". My project contains only this file; I wrote it inside the SDK's VectorAdd project and just modified that sample, so the kernel and the main function live in a single .cu file.
I compared the GPU result with my CPU result and found that they do not match. Another question: when I use the tmp variable instead of writing directly to C[rowi*matrixcol+coli], the result is also wrong, and I don't know why.
Answer 0 (score: 2):
I see 2 problems in your code. First, in your kernel you are not properly conditioning the code on valid thread indices. Valid thread indices are the ones that correspond to actual elements of the result matrix; invalid thread indices are the ones outside that region. You do have a check, but it is in the wrong place in the code. Instead of this:
C[rowi*Bcol+coli] = 0;
for (int k = 0; k < Acol; k++)
{
    if (rowi < Arow && coli < Bcol)
        C[rowi*Bcol+coli] += A[rowi*Acol+k]*B[k*Bcol+coli];
}
use this:
if (rowi < Arow && coli < Bcol) {
    C[rowi*Bcol+coli] = 0;
    for (int k = 0; k < Acol; k++)
    {
        C[rowi*Bcol+coli] += A[rowi*Acol+k]*B[k*Bcol+coli];
    }
}
Because of the way the code was written, some threads outside the valid range were zeroing out elements they should never touch, due to this line, which comes before the valid-thread check:
C[rowi*Bcol+coli]=0;
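This also relates to your second question about the tmp variable: in that version the final store (the commented-out C[rowi*Bcol+coli]=tmp;) likewise sits outside any bounds check, so out-of-range threads still write past the end of C. As a minimal sketch of the same fix with a register accumulator (the name MatrixMulTmp is just for illustration):

__global__ void MatrixMulTmp(const float* A, const float* B, float* C, int Arow, int Acol, int Bcol)
{
    int coli = blockDim.x * blockIdx.x + threadIdx.x;
    int rowi = blockDim.y * blockIdx.y + threadIdx.y;
    if (rowi < Arow && coli < Bcol)
    {
        // Accumulate the dot product in a register; nothing is read from or
        // written to A, B, or C outside the valid region.
        float tmp = 0.0f;
        for (int k = 0; k < Acol; k++)
            tmp += A[rowi*Acol+k]*B[k*Bcol+coli];
        C[rowi*Bcol+coli] = tmp;   // single guarded store per thread
    }
}

Besides avoiding the out-of-bounds writes, this removes the repeated read-modify-write of C in global memory inside the loop.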
The second problem I see is that your equality check is probably too tight. You have this:
if (fabs(h_C[i] - h_C_cpu[i]) > 1e-5)
I changed it to:
if (fabs(h_C[i] - h_C_cpu[i]) > 1e-4)
With the above changes, I get matching results. You can experiment with the equality check to see how many digits actually match, but you are expecting too many matching digits for 32-bit float quantities. Your residual check here is not scaled, so it cannot be as tight as you might think. If you build a scaled residual check, you can verify each element to a given relative precision.
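A minimal sketch of such a scaled (relative) check, assuming a tolerance such as 1e-3; the helper name and the exact threshold are my own choices, not anything fixed:

#include <math.h>
#include <stdio.h>
// Scaled residual check: the allowed error grows with the magnitude of the
// reference value (with a floor of 1.0 so tiny values are not over-penalized).
bool ResultsMatch(const float* gpu, const float* cpu, int n, float relTol)
{
    for (int i = 0; i < n; ++i)
    {
        float denom = fabsf(cpu[i]) > 1.0f ? fabsf(cpu[i]) : 1.0f;
        if (fabsf(gpu[i] - cpu[i]) / denom > relTol)
        {
            printf("Mismatch at idx: %d CPU: %f GPU: %f\n", i, cpu[i], gpu[i]);
            return false;
        }
    }
    return true;
}

You would call it as something like ResultsMatch(h_C, h_C_cpu, a_row*b_col, 1e-3f) in place of the absolute-difference loop.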
As a further suggestion, in your result comparison loop I would change this line:
//printf("The result is wrong!\n");
to:
printf("The result is wrong at idx: %d CPU: %f GPU: %f\n", i, h_C_cpu[i], h_C[i]);
so that you get more useful output about what actually went wrong, if you want to investigate further.