我正在尝试使用 CUDA 把两个较小的矩阵拼成一个更大的矩阵。我的任务是:给定两个矩阵,第一个矩阵应复制到结果的上半部分,第二个矩阵应复制到结果的下半部分。我尝试之后,上半部分可以正确复制,但下半部分无法复制(或者得到意想不到的结果,即 0)。我不想改变块的数量,请帮帮我。
这是我的代码:
#include <stdio.h>
#include <stdlib.h>
#define N 5
#define BLOCK_DIM 3
// Stack two N x N row-major matrices vertically into a 2N x N matrix:
// rows 0..N-1 of c receive a, rows N..2N-1 of c receive b.
//
// Launch layout: a 2D grid whose x/y extents each cover at least N threads.
// Every thread that maps onto an (row, col) cell of the N x N inputs copies
// two elements (one from a, one from b), so the grid does not have to span
// the 2N output rows.
//
// a, b: device pointers to N*N ints each (read only).
// c:    device pointer to 2*N*N ints (written).
__global__ void matrixCombine (int *a, int *b, int *c) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads in the padding of the last block in each dimension fall outside
    // the N x N inputs and must not touch memory.
    if (col < N && row < N) {
        int index = col + row * N;   // row-major offset inside an N-column matrix
        c[index] = a[index];         // upper half: c[row][col]
        c[index + N * N] = b[index]; // lower half: c[row + N][col], N rows further down
    }
}
// Print an N x N matrix to stdout, one row per line; every value is followed
// by a tab character.
void printMatrix(int a[N][N] )
{
    int row = 0;
    while (row < N) {
        const int *cells = a[row];
        for (int k = 0; k != N; ++k)
            printf("%d\t", cells[k]);
        printf("\n");
        ++row;
    }
}
// Print the combined result to stdout: 2*N rows of N values, each value
// followed by a tab and each row terminated by a newline.
// The parameter decays to a pointer to rows of N ints, so the caller must
// pass storage that really holds 2*N such rows.
void printMatrixAns(int a[N][N] )
{
    const int cellCount = 2 * N * N;
    for (int cell = 0; cell < cellCount; ++cell) {
        const int r = cell / N;
        const int k = cell % N;
        printf("%d\t", a[r][k]);
        if (k == N - 1)
            printf("\n");   // last column of this row
    }
}
// Abort with a readable message when a CUDA runtime call reports an error.
// `what` names the operation so the failing step is identifiable in the log.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fill two N x N host matrices with pseudo-random values, run matrixCombine
// on the device, and print the inputs and the 2N x N result.
int main() {
    int a[N][N], b[N][N], c[2*N][N];
    int *dev_a, *dev_b, *dev_c;
    const size_t size = N * N * sizeof(int);          // bytes in one input matrix
    const size_t sizeofc = 2 * N * N * sizeof(int);   // bytes in the combined output

    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++) {
            a[i][j] = rand() % 256;
            b[i][j] = rand() % 256;
        }

    printf("Matrix A\n");
    printMatrix(a);
    printf("Matrix B\n");
    printMatrix(b);

    checkCuda(cudaMalloc((void**)&dev_a, size), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void**)&dev_b, size), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc((void**)&dev_c, sizeofc), "cudaMalloc(dev_c)");

    checkCuda(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice), "copy of b to device");

    // Ceil-division so the grid covers at least N threads in each dimension.
    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);
    printf("dimGrid.x = %d, dimGrid.y = %d\n", dimGrid.x, dimGrid.y);

    matrixCombine<<<dimGrid,dimBlock>>>(dev_a, dev_b, dev_c);
    // A launch does not return a status itself: configuration errors show up
    // in cudaGetLastError(), execution faults at the next synchronizing call.
    checkCuda(cudaGetLastError(), "matrixCombine launch");
    checkCuda(cudaDeviceSynchronize(), "matrixCombine execution");

    checkCuda(cudaMemcpy(c, dev_c, sizeofc, cudaMemcpyDeviceToHost), "copy of c to host");

    printf("Matrix c\n");
    printMatrixAns(c);

    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
    checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");
    return 0;
}
答案 0 :(得分:1)
在你的内核中,这个:
int index = col + row * N;
if (col < N && row < N) //copying upper matrix is working
c[index] = a[index];
if (col >= N && row >= N) //copying lower matrix is NOT working
c[index] = b[index];
是错误的。a 和 b 都只有 NxN 个元素,而 c 有 2NxN 个元素,因此这段代码会导致对 b 和 c 的越界内存访问;并且你只启动了略多于 NxN 个线程,所以无法保证网格大小足以覆盖 2NxN 个输出元素。如果你想在 2NxN 的矩阵中把 a 叠放在 b 的上方,你需要类似这样的内容:
c[row][col] = a[row][col]
c[row+N][col] = b[row][col]
其中 row<N 且 col<N。对于线程数略多于 NxN 的网格,代码可能看起来像这样:
// Row-major linear offset of (row, col) inside an N-column matrix.
int index = col + row * N;
// Only threads that map onto a cell of the N x N inputs do any work.
if (col < N && row < N) {
// Upper half of c: same offset as in a.
c[index] = a[index];
// Lower half of c: shifted by N*N elements, i.e. N rows further down.
c[index + N*N] = b[index];
}
也就是说,每个线程各将 a 的一个元素和 b 的一个元素复制到 c 中:a 的元素写入 c[row][col],b 的元素写入 c[row+N][col]。