CUDA in-place transpose doesn't complete the transpose of the whole matrix

Time: 2015-03-07 11:10:47

Tags: cuda

I've written the CUDA code below. It's supposed to transpose a matrix using tiled blocks, and the code works with small values, but when I use, for example, TILE = 32 and a 128 x 128 matrix, it doesn't complete the transpose: it stops after 96. On the host, these are my thread/block dimensions:

dim3 dimGrid((nEven + TILE_DIM - 1) / TILE_DIM, (nEven + TILE_DIM - 1) / TILE_DIM); 
dim3 dimBlock(TILE_DIM, TILE_DIM);

where I make the number of threads per block equal to the tile size. The global kernel is simple and in theory should work:

__global__ void transposeMain( int *idata)
{
    __shared__ int tile2[TILE_DIM][TILE_DIM];

    int yyy = blockIdx.y * TILE_DIM ; // col values (0,32,64,96)
    int xxx = blockIdx.x * TILE_DIM ; // row values (0,32,64,96)

    if (xxx < nEven && yyy < nEven) 
    {
        tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];

        __syncthreads();

        idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];

    }
}
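For completeness, the launch looks roughly like this (the rest of the host code is omitted; d_idata and h_idata are just placeholder names for the device and host copies of the nEven x nEven matrix):

int *d_idata;
cudaMalloc(&d_idata, nEven * nEven * sizeof(int));
cudaMemcpy(d_idata, h_idata, nEven * nEven * sizeof(int), cudaMemcpyHostToDevice);
transposeMain<<<dimGrid, dimBlock>>>(d_idata);
cudaDeviceSynchronize();
cudaMemcpy(h_idata, d_idata, nEven * nEven * sizeof(int), cudaMemcpyDeviceToHost);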

Any idea what the problem might be?

2 Answers:

Answer 0 (score: 1)

The problem is that you are trying to do an in-place transpose.

CUDA device code execution is broken up into threadblocks. Threadblocks (groups of threads) can execute in any order, and they do not all (usually) execute at the same time. So when you read a tile in here:

tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];

That's fine. But when you write the tile back:

idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];

you will frequently be overwriting data (in some other tile of the original matrix) that has not been read yet, because the threadblock responsible for reading that tile hasn't even begun to execute. For example, the block working on one off-diagonal tile writes its output over the mirror tile across the diagonal; if the block that should read that mirror tile hasn't run yet, its original data is gone. Once you overwrite it like this, it's lost.

The solution (for a square matrix transpose) has several aspects to it:

  1. Each threadblock must first read 2 tiles. These 2 tiles from the input data will be swapped.
  2. Then each threadblock can write out those two tiles.
  3. The tiles on the main diagonal need special casing.
  4. Since most threadblocks are handling 2 tiles, only threadblocks on or on one side of the main diagonal need to do any work.

You haven't shown a complete MCVE (which is expected when you have a problem like this), and your code has other issues, such as the potential for uncoalesced access (lower performance), so I'm not going to try to "fix" your code.

    Instead, here is a fully worked example, taken from here:

    $ cat t469.cu
    #include <stdio.h>
    #include <cublas_v2.h>
    #include <time.h>
    #include <sys/time.h>
    #define uS_PER_SEC 1000000
    #define uS_PER_mS 1000
    #define N 4096
    #define M 4096
    #define TILE_DIM 32
    #define BLOCK_ROWS 8
    
    
    
    __global__ void transposeCoalesced(float *odata, const float *idata)
    {
      __shared__ float tile[TILE_DIM][TILE_DIM+1];
    
      int x = blockIdx.x * TILE_DIM + threadIdx.x;
      int y = blockIdx.y * TILE_DIM + threadIdx.y;
      int width = gridDim.x * TILE_DIM;
    
      for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
         tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
    
      __syncthreads();
    
      x = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
      y = blockIdx.x * TILE_DIM + threadIdx.y;
    
      for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
         odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
    }
    
    __global__ void iptransposeCoalesced(float *data)
    {
      __shared__ float tile_s[TILE_DIM][TILE_DIM+1];
      __shared__ float tile_d[TILE_DIM][TILE_DIM+1];
    
      int x = blockIdx.x * TILE_DIM + threadIdx.x;
      int y = blockIdx.y * TILE_DIM + threadIdx.y;
      int width = gridDim.x * TILE_DIM;
    
      if (blockIdx.y>blockIdx.x) { // handle off-diagonal case
        int dx = blockIdx.y * TILE_DIM + threadIdx.x;
        int dy = blockIdx.x * TILE_DIM + threadIdx.y;
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
          tile_s[threadIdx.y+j][threadIdx.x] = data[(y+j)*width + x];
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
          tile_d[threadIdx.y+j][threadIdx.x] = data[(dy+j)*width + dx];
        __syncthreads();
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
          data[(dy+j)*width + dx] = tile_s[threadIdx.x][threadIdx.y + j];
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
          data[(y+j)*width + x] = tile_d[threadIdx.x][threadIdx.y + j];
      }
    
      else if (blockIdx.y==blockIdx.x){ // handle on-diagonal case
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
          tile_s[threadIdx.y+j][threadIdx.x] = data[(y+j)*width + x];
        __syncthreads();
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
          data[(y+j)*width + x] = tile_s[threadIdx.x][threadIdx.y + j];
      }
    }
    
    
    int validate(const float *mat, const float *mat_t, int n, int m){
       int result = 1;
       for (int i = 0; i < n; i++)
         for (int j = 0; j < m; j++)
           if (mat[(i*m)+j] != mat_t[(j*n)+i]) result = 0;
       return result;
    }
    
    int main(){
    
        timeval t1, t2;
        float *matrix = (float *) malloc (N * M * sizeof(float));
        for (int i = 0; i < N; i ++)
          for (int j = 0; j < M; j++)
            matrix[(i*M) + j] = i;
    // Starting the timer
        gettimeofday(&t1, NULL);
        float *matrixT = (float *) malloc (N * M * sizeof(float));
        for (int i = 0; i < N; i++)
            for (int j = 0; j < M; j++)
                matrixT[(j*N)+i] = matrix[(i*M)+j]; // matrix is obviously filled
    //Ending the timer
        gettimeofday(&t2, NULL);
        if (!validate(matrix, matrixT, N, M)) {printf("fail!\n"); return 1;}
        float et1 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
        printf("CPU time = %fms\n", et1);
    
        float *h_matrixT , *d_matrixT , *d_matrix;
        h_matrixT = (float *) (malloc (N * M * sizeof(float)));
        cudaMalloc((void **)&d_matrixT , N * M * sizeof(float));
        cudaMalloc((void**)&d_matrix , N * M * sizeof(float));
        cudaMemcpy(d_matrix , matrix , N * M * sizeof(float) , cudaMemcpyHostToDevice);
    
    //Starting the timer
        gettimeofday(&t1, NULL);
    
        const float alpha = 1.0;
        const float beta  = 0.0;
        cublasHandle_t handle;
        //gettimeofday(&t1, NULL);
        cublasCreate(&handle);
        gettimeofday(&t1, NULL);
        cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, &alpha, d_matrix, M, &beta, d_matrix, N, d_matrixT, N);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        cublasDestroy(handle);
    
    //Ending the timer
        float et2 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
        printf("GPU Sgeam time = %fms\n", et2);
    
        cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
        if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
        cudaMemset(d_matrixT,0, N*M*sizeof(float));
        memset(h_matrixT, 0, N*M*sizeof(float));
        dim3 threads(TILE_DIM, BLOCK_ROWS);
        dim3 blocks(N/TILE_DIM, M/TILE_DIM);
        gettimeofday(&t1, NULL);
        transposeCoalesced<<<blocks, threads >>>(d_matrixT, d_matrix);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
        if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
        float et3 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
        printf("GPU kernel time = %fms\n", et3);
    
        memset(h_matrixT, 0, N*M*sizeof(float));
        gettimeofday(&t1, NULL);
        iptransposeCoalesced<<<blocks, threads >>>(d_matrix);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        cudaMemcpy(h_matrixT , d_matrix , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
        if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
        float et4 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
        printf("GPU in-place kernel time = %fms\n", et4);
    
    
        cudaFree(d_matrix);
        cudaFree(d_matrixT);
        return 0;
    }
    $ nvcc -arch=sm_20 -o t469 t469.cu -lcublas
    $ ./t469
    CPU time = 450.095001ms
    GPU Sgeam time = 1.937000ms
    GPU kernel time = 1.694000ms
    GPU in-place kernel time = 1.839000ms
    $
    

Note that this compares several different approaches to matrix transpose. If you study the iptransposeCoalesced kernel, you will see that it conforms to the 4 specific aspects outlined above.
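For orientation only, here is a minimal sketch of that same 2-tile-swap idea mapped onto the launch configuration from the question (one TILE_DIM x TILE_DIM threadblock per tile, one element per thread). The kernel name transposeInPlaceTiles is made up here, it assumes the matrix dimension n is a multiple of TILE_DIM, and it is untested against your code:

    __global__ void transposeInPlaceTiles(int *data, int n)
    {
      __shared__ int tile_a[TILE_DIM][TILE_DIM+1];  // +1 padding avoids shared memory bank conflicts
      __shared__ int tile_b[TILE_DIM][TILE_DIM+1];

      int col = blockIdx.x * TILE_DIM + threadIdx.x;
      int row = blockIdx.y * TILE_DIM + threadIdx.y;

      if (blockIdx.y > blockIdx.x) {                      // off-diagonal: swap this tile with its mirror
        int dcol = blockIdx.y * TILE_DIM + threadIdx.x;   // mirror tile coordinates
        int drow = blockIdx.x * TILE_DIM + threadIdx.y;
        tile_a[threadIdx.y][threadIdx.x] = data[row*n + col];
        tile_b[threadIdx.y][threadIdx.x] = data[drow*n + dcol];
        __syncthreads();                                  // both tiles fully read before anything is written
        data[row*n + col]   = tile_b[threadIdx.x][threadIdx.y];
        data[drow*n + dcol] = tile_a[threadIdx.x][threadIdx.y];
      }
      else if (blockIdx.y == blockIdx.x) {                // diagonal: transpose the tile within itself
        tile_a[threadIdx.y][threadIdx.x] = data[row*n + col];
        __syncthreads();
        data[row*n + col] = tile_a[threadIdx.x][threadIdx.y];
      }
      // blocks with blockIdx.y < blockIdx.x do nothing: their tiles are
      // written by the mirror block on the other side of the diagonal
    }

Launched with dim3 dimGrid(nEven/TILE_DIM, nEven/TILE_DIM) and dim3 dimBlock(TILE_DIM, TILE_DIM), each block only touches tiles that no other block touches, so the order in which blocks run no longer matters.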

Answer 1 (score: 0)

Using __syncthreads(); inside an if statement in CUDA is suspect. Try moving it outside that block with a simple change:

if (xxx < nEven && yyy < nEven) 
{
    tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
}
__syncthreads();
if (xxx < nEven && yyy < nEven) 
{
    idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
}
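
Put back into the kernel from the question, that restructuring (and nothing else changed) would look like the sketch below; note that it only moves the barrier out of divergent control flow and does not by itself address the in-place overwrite between blocks described in the other answer:

__global__ void transposeMain( int *idata)
{
    __shared__ int tile2[TILE_DIM][TILE_DIM];

    int yyy = blockIdx.y * TILE_DIM ;
    int xxx = blockIdx.x * TILE_DIM ;

    // load this block's tile into shared memory
    if (xxx < nEven && yyy < nEven)
    {
        tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
    }

    // the barrier is now reached by every thread in the block
    __syncthreads();

    // write the tile back to its transposed position
    if (xxx < nEven && yyy < nEven)
    {
        idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
    }
}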