我正在尝试模拟cuda C中的矩阵乘法。除输出外,一切都是正确的。
这是我的计划:
#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <conio.h>
#define N 4
#define TILE_WIDTH 2
__global__ void MatMul(int*A, int* B, int* C) {
int sum;
int idx = threadIdx.x;
int idy = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int k ,uidx , uidy , i;
uidx = bx*TILE_WIDTH + idx;
uidy = by*TILE_WIDTH + idy;
sum = 0;
// Allocating memory in shared memory
__shared__ int temp1[TILE_WIDTH][TILE_WIDTH];
__shared__ int temp2[TILE_WIDTH][TILE_WIDTH];
//copying the data to shared memory
for( i =0;i<N/TILE_WIDTH; i++)
{
temp1[idy][idx] = A[uidy * N + ((i*TILE_WIDTH)+uidx)%N];
temp2[idy][idx] = B[(i*TILE_WIDTH+uidy * N)%N + uidx];
__syncthreads();
// multiplying matrices in shared memory
for(k=0 ; k < TILE_WIDTH;k++) {
sum = sum + temp1[idy][k]*temp2[k][idx];
}
}
// synchronizing the threads
__syncthreads();
C[uidy*N + uidx] = sum;
}
int main( void ) {
int a[N][N], b[N][N], c[N][N]; //host copies of a,b,c
int *dev_a, *dev_b, *dev_c; //device copies of a,b,c
// allocate the memory on the GPU
cudaMalloc( (void**)&dev_a, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_b, N * N * sizeof(int) );
cudaMalloc( (void**)&dev_c, N * N * sizeof(int) );
// fill the matrices 'a' and 'b' on the CPU
for (int i=0; i<N; i++) {
for (int j=0; j < N; j++) {
a[i][j] = j+3;
b[i][j] = i+6;
}
}
//copy above a,b values to device
cudaMemcpy( dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice );
// Prepare timer
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//start record
cudaEventRecord(start, 0);
// Kernel invocation with N threads
dim3 dimGrid(2,2,1);
dim3 dimBlock(TILE_WIDTH,TILE_WIDTH,1);
MatMul<<<dimGrid , dimBlock>>> (dev_a, dev_b, dev_c);
//stop record
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//this is operation time
cudaEventElapsedTime(&time, start, stop);
//clean up
cudaEventDestroy(start);
cudaEventDestroy(stop);
//copy result to host
cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost );
//output..
for (int i=0; i < N; i++){
for (int j=0; j < N; j++)
printf( "%d ", a[i][j]);
printf (" ");
for (int j=0; j < N; j++)
printf( "%d ", b[i][j]);
printf (" = ");
for (int j=0; j < N; j++)
printf( "%d ", c[i][j]);
printf ("\n");
}
//free the allocated memory in device
cudaFree( dev_a );
cudaFree( dev_b );
cudaFree( dev_c );
printf("\n multiplication done!!!\n");
printf("\n");
printf(" time elapsed in ms=%f\n",time);
getch();
return 0;
}
这是我的输出:
3 4 5 6 6 6 6 6 108 108 115 115
3 4 5 6 7 7 7 7 108 108 115 115
3 4 5 6 8 8 8 8 108 108 115 115
3 4 5 6 9 9 9 9 108 108 115 115
显示错误的值。请告诉我程序中的任何错误。我对CUDA C很新。
答案 0 :(得分:1)
虽然我不知道您的程序有什么问题,但我认为您应该能够使用更简单的矩阵更好地诊断它。您是否尝试过两个Identity矩阵的乘法?或者充满了所有1。使用各种简单矩阵的重复测试应该证明细胞是如何组合的。
最终,我认为您会发现使用TILE_WIDTH的方式存在问题,但我无法确定。
答案 1 :(得分:1)
这应该修复它(在 i 循环中):
temp1[idy][idx]= A[TILE_WIDTH*(by*N+i) + idx+idy*N];
temp2[idy][idx]= B[TILE_WIDTH*(bx+N*i) + idx+idy*N];