My task is to transpose a matrix in CUDA using shared memory, without bank conflicts. The constraint is width * height <= 10^8. The critical test sizes are 1 x 10^8, 10^4 x 10^4, and 10^8 x 1.
I tried the solution from Matrix Transpose (with shared Memory) with arbitary size on Cuda C, but it did not help me, because my matrices are too large and exceed the CUDA dimension limits (65536 blocks and 32 threads per block).
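For scale, a quick back-of-the-envelope check (my own illustration, not part of the linked post) shows why a one-thread-per-element grid cannot cover these sizes, assuming 32x32 thread blocks:

#include <cstdio>

int main()
{
    // One of the critical test cases has 10^8 elements along a single dimension.
    const unsigned long length = 100000000UL;
    const unsigned long BLOCK = 32;
    // Blocks needed along that grid axis if every element gets its own thread:
    unsigned long blocks = (length + BLOCK - 1) / BLOCK;
    printf("%lu blocks\n", blocks);   // 3,125,000 -- far beyond the grid limit mentioned above
    return 0;
}

So the grid has to stay small, and each block must process many tiles.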
I tried to write a loop that lets the kernel handle huge matrices:
const int BLOCK_DIM = 32;

__global__ void transposeMatrixFast(double* inputMatrix, double* outputMatrix, int width, int height)
{
    __shared__ double temp[BLOCK_DIM][BLOCK_DIM + 1];

    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    int offsetx = gridDim.x * blockDim.x;
    int offsety = gridDim.y * blockDim.y;

    // Grid-stride loops so a fixed-size grid can cover a matrix of any size
    for (int y = yIndex; y < height; y += offsety)
    {
        for (int x = xIndex; x < width; x += offsetx)
        {
            if ((xIndex < width) && (yIndex < height))
            {
                int idx = y * width + x;
                temp[threadIdx.y][threadIdx.x] = inputMatrix[idx];
            }

            __syncthreads();

            if ((x < width) && (y < height))
            {
                int idx = x * height + y;
                outputMatrix[idx] = temp[threadIdx.y][threadIdx.x];
            }
        }
    }
}
Now I get a "time limit exceeded" error on the test server. The reason is that I get no benefit from shared memory on this line:
outputMatrix[idx] = temp[threadIdx.y][threadIdx.x];
so my kernel is slow. I think there must be another way to organize the loops, but I don't know how.
Answer (score 0):
I found another way to organize the loops, and now I can transpose a matrix of any size:
const int BLOCK_SIZE = 32;

__global__ void matrixTransposeSolveBankConflicts(const double *d_a, double *d_b, const unsigned long rows, const unsigned long cols) {
    // +1 column of padding so the transposed shared-memory accesses fall into different banks
    __shared__ double mat[BLOCK_SIZE][BLOCK_SIZE + 1];

    unsigned long bh = ceil((double)rows / BLOCK_SIZE);
    unsigned long bw = ceil((double)cols / BLOCK_SIZE);

    // Grid-stride loops over tiles, so a fixed-size grid covers a matrix of any size
    for (unsigned long blocky = blockIdx.y; blocky < bh; blocky += gridDim.y) {
        for (unsigned long blockx = blockIdx.x; blockx < bw; blockx += gridDim.x) {
            unsigned long bx = blockx * BLOCK_SIZE;
            unsigned long by = blocky * BLOCK_SIZE;

            unsigned long i = by + threadIdx.y;
            unsigned long j = bx + threadIdx.x;

            // Coalesced read from the input, transposed store into the padded tile
            if (i < rows && j < cols)
            {
                mat[threadIdx.x][threadIdx.y] = d_a[i*cols + j];
            }

            __syncthreads();

            unsigned long ti = bx + threadIdx.y;
            unsigned long tj = by + threadIdx.x;

            // Coalesced write of the transposed tile to the output
            if (tj < rows && ti < cols)
            {
                d_b[ti*rows + tj] = mat[threadIdx.y][threadIdx.x];
            }

            __syncthreads(); // the tile is reused in the next iteration
        }
    }
}
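For completeness, here is a minimal host-side launch sketch for this kernel (my own addition; the wrapper name, the 65535 grid cap, and the launch parameters are assumptions, not part of the original answer). The grid is capped at a size every device supports, and the grid-stride loops inside the kernel sweep the remaining tiles:

#include <cuda_runtime.h>

// Hypothetical wrapper: launches the kernel above with a grid small enough for any device.
void transposeOnDevice(const double *d_a, double *d_b, unsigned long rows, unsigned long cols)
{
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    unsigned long bw = (cols + BLOCK_SIZE - 1) / BLOCK_SIZE;   // tiles along x
    unsigned long bh = (rows + BLOCK_SIZE - 1) / BLOCK_SIZE;   // tiles along y
    // Cap each grid dimension at 65535; tiles beyond the grid are picked up
    // by the blockx/blocky loops inside the kernel.
    dim3 grid((unsigned int)(bw < 65535UL ? bw : 65535UL),
              (unsigned int)(bh < 65535UL ? bh : 65535UL));
    matrixTransposeSolveBankConflicts<<<grid, block>>>(d_a, d_b, rows, cols);
    cudaDeviceSynchronize();
}

Capping at 65535 keeps the launch legal even on older devices; a much smaller grid (for example a few times the number of SMs) also works, since every block simply loops over as many tiles as needed.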