我最近开始使用 OpenCL,正在尝试把我的 CUDA 平铺(tiled)矩阵乘法代码移植到 OpenCL。我已经做了一些修改,但不确定做得是否正确。我不知道 CUDA 中的 blockIdx、threadIdx、__syncthreads,以及共享内存数组 Ashare 和 Bshare,在 OpenCL 中分别对应什么。如果有人能帮助我,我将非常感激。
我的CUDA平铺矩阵乘法的内核代码:
#define TILE_WIDTH 16

/*
 * Tiled matrix multiplication: C = A * B.
 *
 * A, B and C are square width x width matrices stored row-major in flat
 * float arrays (element (r, c) lives at index r * width + c).
 *
 * Launch requirements:
 *   - blockDim must be (TILE_WIDTH, TILE_WIDTH): each thread loads exactly
 *     one element of each tile and produces one element of C.
 *   - gridDim must cover the output, i.e. at least
 *     ceil(width / TILE_WIDTH) blocks in both x and y.
 *   - Uses two TILE_WIDTH x TILE_WIDTH float tiles of static shared memory.
 *
 * width does not have to be a multiple of TILE_WIDTH: out-of-range tile
 * entries are loaded as zero and out-of-range threads skip the final store.
 *
 * Parameters:
 *   A      left operand, width * width floats (read only)
 *   B      right operand, width * width floats (read only)
 *   C      output, width * width floats (written)
 *   width  number of rows/columns of each matrix
 */
__global__ void matrixMulKernel(float* A, float* B, float* C, int width) {
    __shared__ float Ashare[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bshare[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    // Row and column of the C element this thread is responsible for.
    int row = by * TILE_WIDTH + ty;
    int col = bx * TILE_WIDTH + tx;

    float result = 0.0f;

    // Ceil-div so a trailing partial tile is still processed.
    int numTiles = (width + TILE_WIDTH - 1) / TILE_WIDTH;

    // Loop over the A and B tiles required to compute the C element.
    for (int m = 0; m < numTiles; m++) {
        int aCol = m * TILE_WIDTH + tx;  // column of A this thread loads
        int bRow = m * TILE_WIDTH + ty;  // row of B this thread loads

        // Collectively load the A and B tiles into shared memory. Entries
        // outside the matrix are zero so they add nothing to the dot product.
        // Flat indices are widened to size_t so row * width cannot overflow int.
        Ashare[ty][tx] = (row < width && aCol < width)
                             ? A[(size_t)row * width + aCol]
                             : 0.0f;
        Bshare[ty][tx] = (bRow < width && col < width)
                             ? B[(size_t)bRow * width + col]
                             : 0.0f;

        // Wait until every thread in the block has written its tile entries.
        __syncthreads();

        // Partial dot product over this tile, read from shared memory
        // (not from the global pointers A and B).
        for (int k = 0; k < TILE_WIDTH; k++) {
            result += Ashare[ty][k] * Bshare[k][tx];
        }

        // Make sure all threads are done reading before the tiles are
        // overwritten by the next iteration.
        __syncthreads();
    }

    // The guard sits after the loop so that every thread, including
    // out-of-range ones, reaches both __syncthreads() calls above.
    if (row < width && col < width) {
        C[(size_t)row * width + col] = result;
    }
}