CUDA矩阵转置共享内存

时间:2016-11-22 17:38:20

标签: c++ matrix cuda transpose

我需要使用共享内存在GPU上实现矩阵转置功能。我已经用一种不使用共享内存的简单方式实现了它,工作正常;我也尝试了使用共享内存(SM)的版本,但不幸的是计算结果不正确,我无法弄清楚原因。完整的可运行示例可以在这里(here)找到,也附在本问题的底部。

编辑1

我进一步发现,结果中第一个出现错误值的索引是32(这是按展平后的一维矩阵计的索引;换成二维表示就是matr[0][32])。

如果还需要更多信息,我很乐意提供。

下面是从完整代码中摘出的、无法正常工作的那个函数:

// Shared-memory transpose attempt: stages data from matrA in a shared tile and
// writes it to matrB, repeating the whole pass nreps times.
//
// matrA is indexed with a row stride of `width` and matrB with a row stride of
// `height`. Each thread handles TILE_DIM / BLOCK_ROWS elements per pass (the
// inner loops step by BLOCK_ROWS).
//
// NOTE(review): this is the kernel reported as producing wrong results; the
// notes below mark the spots that look suspicious. Confirm before relying on it.
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width,
    const int height, const int nreps)
{
    // Staging tile with one extra column per row (presumably padding against
    // shared-memory bank conflicts -- TODO confirm).
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    // Block coordinates are remapped: the tile row comes from blockIdx.x and the
    // tile column from (blockIdx.x + blockIdx.y) modulo gridDim.x.
    // NOTE(review): blockIdx.x spans the whole of gridDim.x, yet it is used here
    // as the row-tile index; unless the grid is square this looks like it
    // addresses tiles outside the matrix -- confirm against the launch config.
    int blockIdx_y = blockIdx.x;
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    // Flattened read position in matrA.
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)* width;

    // Flattened write position in matrB: block coordinates swapped, row stride
    // `height`.
    xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)* height;

    int r, i;
#pragma unroll
    for (r = 0; r < nreps; r++)
    {
        // Load phase: copy from matrA into the tile.
        // NOTE(review): this read has no bounds check at all.
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];

        // All loads must be complete before any thread reads the tile below.
        __syncthreads();

        // Store phase: read the tile with the thread indices swapped and write
        // to matrB.
        // NOTE(review): the guard only tests the flattened input index against
        // width * height; it does not test the row and the column separately.
        // NOTE(review): there is no barrier after this loop, so the next
        // repetition's loads can start while other threads still read the tile.
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if (index_in + i * width < width * height)
               matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
    }
}

输出如下:

Avg. CPU Transpose Time: 0.106048 ms, Bandwidth: 3.771873 GB/s

Avg. GPU Naive Trans Time: 0.009871 ms, bandwidth: 40.520836 GB/s
    Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.007598 ms, bandwidth: 52.643482 GB/s
    Correct: 12352, Wrong: 37648

这是完整的可运行示例。我已经删掉了其中大部分不必要的代码,使其尽量精简:

#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_DIM 32
#define BLOCK_ROWS 8
#define BLOCK_COLS 32

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);

// Compares the CPU reference against a GPU result element by element (exact
// float comparison) and prints how many positions agree and how many differ.
static void reportMatches(const float *expected, const float *actual, const int count)
{
    int correct = 0;
    int wrong = 0;

    for (int idx = 0; idx < count; ++idx)
    {
        if (expected[idx] == actual[idx])
            ++correct;
        else
            ++wrong;
    }

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
}

// Benchmark driver: fills a width x height matrix with 0, 1, 2, ..., transposes
// it on the CPU (timed, averaged over nreps), then runs both GPU variants
// through matrMagicCuda and reports how many elements match the CPU result.
// Returns 1 only when cudaDeviceReset fails, otherwise 0.
int main()
{
    const int nreps = 10000;
    const int width = 500;
    const int height = 100;
    const int size = width * height;
    const size_t bytes = size * sizeof(float);

    srand(time(NULL));

    float *matrA = (float*)malloc(bytes);   // matrix A
    float *matrAC = (float*)malloc(bytes);  // matrix A copied (not referenced again)
    float *matrATC = (float*)malloc(bytes); // matrix A transposed by CPU
    float *matrATG = (float*)malloc(bytes); // matrix A transposed by GPU

    for (int k = 0; k < size; ++k)
        matrA[k] = (float)k;

    // CPU transpose, timed with the host clock.
    const auto t0 = std::chrono::high_resolution_clock::now();
    cpuMatrTrans(matrATC, matrA, width, height, nreps);
    const auto t1 = std::chrono::high_resolution_clock::now();

    const std::chrono::duration<double> wall = t1 - t0;
    const double cpuTime = (wall.count() * 1000) / nreps;
    // ms -> s and B -> GB are folded into the single 1e6 factor; the factor two
    // accounts for one read plus one write per element.
    const double cpuBandwidth = (sizeof(float) * size * 2) / (cpuTime * 1000000);
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

    // Naive GPU transpose.
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1);
    reportMatches(matrATC, matrATG, size);

    // GPU transpose through shared memory.
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2);
    reportMatches(matrATC, matrATG, size);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    if (cudaDeviceReset() != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

// Runs one GPU transpose variant and copies the result back to the host.
//
//   matrB      host buffer receiving width * height floats
//   matrA      host buffer holding the width * height input floats
//   nreps      repetition count forwarded to the kernel; the measured time is
//              divided by it
//   operation  1 = naiveTransKernel, 2 = notSoNaivaTransKernel; any other value
//              only prints a message and launches nothing
//
// Returns the status of the last checked CUDA call. Device buffers are released
// on every path.
cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
{
    const int size = width * height;
    const size_t bytes = size * sizeof(float);

    // One TILE_DIM x BLOCK_ROWS block per TILE_DIM x TILE_DIM tile; the grid is
    // rounded up so partial tiles at the edges are covered.
    const dim3 dim_block(TILE_DIM, BLOCK_ROWS, 1);
    const dim3 dim_grid((width + TILE_DIM - 1) / TILE_DIM, (height + TILE_DIM - 1) / TILE_DIM, 1);

    float *dev_matrA = 0;
    float *dev_matrB = 0;
    cudaError_t cudaStatus;

    // Single-pass block: `break` jumps to the shared cleanup below.
    do
    {
        // Choose which GPU to run on, change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
            break;
        }

        cudaStatus = cudaMalloc((void**)&dev_matrA, bytes);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        cudaStatus = cudaMalloc((void**)&dev_matrB, bytes);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        // Upload the input matrix.
        cudaStatus = cudaMemcpy(dev_matrA, matrA, bytes, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        if (operation == 1 || operation == 2)
        {
            const bool naive = (operation == 1);

            cudaEventRecord(start);
            if (naive)
                naiveTransKernel<<<dim_grid, dim_block>>>(dev_matrB, dev_matrA, width, height, nreps);
            else
                notSoNaivaTransKernel<<<dim_grid, dim_block>>>(dev_matrB, dev_matrA, width, height, nreps);
            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            float elapsed = 0;
            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            // Average per repetition; ms -> s and B -> GB are folded into the
            // 1e6 factor, times two for one read and one write per element.
            elapsed /= nreps;
            const double gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);

            if (naive)
                printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
            else
                printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);
        }
        else
        {
            printf("No matching opcode was found.\n");
        }

        // Check for any errors launching the kernel.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }

        // Wait for the kernel to finish and pick up any execution error.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
            break;
        }

        // Download the result.
        cudaStatus = cudaMemcpy(matrB, dev_matrB, bytes, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }
    } while (false);

    cudaFree(dev_matrB);
    cudaFree(dev_matrA);

    return cudaStatus;
}

// Reference transpose on the host: reads matrA as `height` rows of `width`
// floats and writes element (row, col) to matrB[col * height + row].
// The whole transpose is repeated nreps times so the caller can average timings.
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    for (int rep = 0; rep < nreps; ++rep)
    {
        for (int row = 0; row < height; ++row)
        {
            for (int col = 0; col < width; ++col)
            {
                const int src = row * width + col;
                const int dst = col * height + row;
                matrB[dst] = matrA[src];
            }
        }
    }
}

// Direct global-memory transpose (no shared memory): copies elements of matrA
// to their transposed positions in matrB, repeating the pass nreps times.
// Each thread handles TILE_DIM / BLOCK_ROWS elements per pass (the inner loop
// steps by BLOCK_ROWS).
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, r;
    // Despite the names, `row` is used as the fast (x) coordinate of matrA and
    // `col` as the coordinate that is multiplied by `width` below.
    int row = blockIdx.x * TILE_DIM + threadIdx.x;
    int col = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = row + width * col;   // flattened read position in matrA
    int index_out = col + height * row; // flattened write position in matrB

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            // NOTE(review): the guard only tests the flattened input index
            // against width * height; it does not test the two coordinates
            // separately, so a thread past the right edge of the matrix can
            // still pass it -- confirm with a memory checker.
            if (index_in + i * width < width * height)
                matrB[index_out + i] = matrA[index_in + i * width];
}

// Shared-memory transpose attempt: stages data from matrA in a shared tile and
// writes it to matrB, repeating the whole pass nreps times.
//
// matrA is indexed with a row stride of `width` and matrB with a row stride of
// `height`. Each thread handles TILE_DIM / BLOCK_ROWS elements per pass (the
// inner loops step by BLOCK_ROWS).
//
// NOTE(review): this is the kernel reported as producing wrong results; the
// notes below mark the spots that look suspicious. Confirm before relying on it.
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    // Staging tile with one extra column per row (presumably padding against
    // shared-memory bank conflicts -- TODO confirm).
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    // Block coordinates are remapped: the tile row comes from blockIdx.x and the
    // tile column from (blockIdx.x + blockIdx.y) modulo gridDim.x.
    // NOTE(review): blockIdx.x spans the whole of gridDim.x, yet it is used here
    // as the row-tile index; unless the grid is square this looks like it
    // addresses tiles outside the matrix -- confirm against the launch config.
    int blockIdx_y = blockIdx.x;
    int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    // Flattened read position in matrA.
    int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
    int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
    int index_in = xIndex + (yIndex)* width;

    // Flattened write position in matrB: block coordinates swapped, row stride
    // `height`.
    xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
    yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
    int index_out = xIndex + (yIndex)* height;

    int r, i;
#pragma unroll
    for (r = 0; r < nreps; r++)
    {
        // Load phase: copy from matrA into the tile.
        // NOTE(review): this read has no bounds check at all.
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];

        // All loads must be complete before any thread reads the tile below.
        __syncthreads();

        // Store phase: read the tile with the thread indices swapped and write
        // to matrB.
        // NOTE(review): the guard only tests the flattened input index against
        // width * height; it does not test the row and the column separately.
        // NOTE(review): there is no barrier after this loop, so the next
        // repetition's loads can start while other threads still read the tile.
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if (index_in + i * width < width * height)
               matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
    }
}

1 个答案:

答案 0 :(得分:1)

此代码存在许多问题。我不确定我能否涵盖所有这些。

可能最重要的问题是您缺少(并且似乎没有理解)正确的2D线程检查。您的算法创建的线程网格在两个维度上都比问题规模更大,这会在两个维度上都产生落在矩阵范围之外的逻辑线程。

您试图像这样创建2D线程检查:

        if (index_in + i * width < width * height)

这不起作用。假设我有一个3x3的矩阵和一个4x4的线程网格。(3,0)处的线程明显超出了矩阵的范围,但它的展平索引是3 + 0 * 3 = 3,小于3 * 3 = 9,因此仍会通过上面的线程检查。

在这种情况下,正确的线程检查必须对每个维度分别进行测试,而不是测试它们的乘积。

请注意,这个“逻辑错误”同样存在于“朴素”(naive)转置内核中;如果您用cuda-memcheck运行代码,就可以确认这一点。即使该内核看起来工作正常,cuda-memcheck也会报告其中的越界访问错误。

还有其他各种问题,其中大部分都与共享内存内核中的索引有关。我不确定您是否理解共享内存转置(shared memory transpose)所必需的索引操作。在这种情况下,我们必须做两个单独的索引转置:

  1. 转置块(平铺)索引
  2. 转置线程索引
线程索引的转置是在读/写共享内存时完成的。您在读/写共享内存时交换了threadIdx.x和threadIdx.y的用法,这一点处理得是正确的。但就我所见,您用于转置块索引(在读/写全局内存时使用)的索引生成是有问题的。这是另一个需要解决的主要问题。

    以下代码修复了这些以及其他一些问题,并且对我来说似乎正常工作:

    $ cat t33.cu
    #include "cuda_runtime.h"
    #include "device_functions.h"
    #include "device_launch_parameters.h"
    
    #include <chrono>
    #include <time.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    #define TILE_DIM 32
    #define BLOCK_ROWS 8
    #define BLOCK_COLS 32
    
    cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
    void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
    __global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
    __global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
    
    // Runs one GPU transpose variant and compares its output with the CPU
    // reference, printing the number of matching and differing elements.
    // Returns false (after printing a diagnostic) when the GPU run itself failed,
    // so a CUDA error is not silently reported as "Wrong" elements.
    static bool runAndVerify(float *gpuOut, float *input, const float *reference,
                             const int width, const int height, const int nreps, const int operation)
    {
        const int size = width * height;

        // Clear the output first so results left over from an earlier run cannot
        // be counted as correct.
        memset(gpuOut, 0, size * sizeof(float));

        const cudaError_t status = matrMagicCuda(gpuOut, input, width, height, nreps, operation);
        if (status != cudaSuccess)
        {
            fprintf(stderr, "GPU transpose (operation %d) failed: %s\n", operation, cudaGetErrorString(status));
            return false;
        }

        int correct = 0;
        int wrong = 0;
        for (int i = 0; i < size; i++)
        {
            // Exact comparison is intended: a transpose only moves values.
            if (reference[i] != gpuOut[i])
                wrong++;
            else
                correct++;
        }

        printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
        return true;
    }

    // Benchmark driver: fills a width x height matrix with 0, 1, 2, ..., transposes
    // it on the CPU (timed, averaged over nreps), then runs the naive and the
    // shared-memory GPU variants and checks each against the CPU result.
    // Returns 0 on success and 1 when an allocation, a GPU run or
    // cudaDeviceReset fails.
    int main()
    {
        const int nreps = 10000;
        const int width = 500;
        const int height = 100;
        const int size = width * height;
        const size_t bytes = size * sizeof(float);
        int exitCode = 0;

        float *matrA = (float*)malloc(bytes);   // matrix A
        float *matrAC = (float*)malloc(bytes);  // matrix A copied (kept from the original listing, otherwise unused)
        float *matrATC = (float*)malloc(bytes); // matrix A transposed by CPU
        float *matrATG = (float*)malloc(bytes); // matrix A transposed by GPU

        if (!matrA || !matrAC || !matrATC || !matrATG)
        {
            fprintf(stderr, "Host allocation failed!\n");
            free(matrATG);
            free(matrATC);
            free(matrAC);
            free(matrA);
            return 1;
        }

        for (int i = 0; i < size; i++)
        {
            matrA[i] = (float)i;
        }

        // CPU transpose, timed with the host clock.
        auto start = std::chrono::high_resolution_clock::now();
        cpuMatrTrans(matrATC, matrA, width, height, nreps);
        auto end = std::chrono::high_resolution_clock::now();

        std::chrono::duration<double> diff = end - start;
        const double cpuTime = (diff.count() * 1000) / nreps;
        // ms -> s and B -> GB are folded into the single 1e6 factor; the factor
        // two accounts for one read plus one write per element.
        const double cpuBandwidth = (sizeof(float) * size * 2) / (cpuTime * 1000000);
        printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

        // Naive transpose
        if (!runAndVerify(matrATG, matrA, matrATC, width, height, nreps, 1))
            exitCode = 1;

        // Transpose with shared memory
        if (!runAndVerify(matrATG, matrA, matrATC, width, height, nreps, 2))
            exitCode = 1;

        // cudaDeviceReset must be called before exiting in order for profiling and
        // tracing tools such as Nsight and Visual Profiler to show complete traces.
        if (cudaDeviceReset() != cudaSuccess)
        {
            fprintf(stderr, "cudaDeviceReset failed!\n");
            exitCode = 1;
        }

        free(matrATG);
        free(matrATC);
        free(matrAC);
        free(matrA);

        return exitCode;
    }
    
    // Prints a diagnostic for a failed CUDA runtime call.
    // Returns true when `status` is an error, false for cudaSuccess.
    static bool cudaFailed(const cudaError_t status, const char *what)
    {
        if (status == cudaSuccess)
            return false;
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return true;
    }

    // Runs one GPU transpose variant, reports its average time and bandwidth, and
    // copies the result back to the host.
    //
    //   matrB      host buffer receiving width * height floats
    //   matrA      host buffer holding the width * height input floats
    //   nreps      repetition count forwarded to the kernel; the measured time is
    //              divided by it
    //   operation  1 = naiveTransKernel, 2 = notSoNaivaTransKernel
    //
    // Returns cudaSuccess, cudaErrorInvalidValue for an unknown operation or
    // non-positive width/height/nreps, or the status of the first CUDA call that
    // failed. Device buffers and timing events are released on every path.
    cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
    {
        // Everything is declared up front so the `goto Error` jumps below never
        // cross an initialization.
        float elapsed = 0;
        float *dev_matrA = 0;
        float *dev_matrB = 0;
        cudaEvent_t start = 0;
        cudaEvent_t stop = 0;
        cudaError_t cudaStatus = cudaSuccess;
        const char *label = 0;
        double gpuBandwidth = 0;
        size_t bytes = 0;
        dim3 dim_grid, dim_block;

        // Reject bad requests before touching the device.
        if (operation == 1)
        {
            label = "Naive Trans";
        }
        else if (operation == 2)
        {
            label = "Trans with SM";
        }
        else
        {
            printf("No matching opcode was found.\n");
            return cudaErrorInvalidValue;
        }

        if (width <= 0 || height <= 0 || nreps <= 0)
        {
            fprintf(stderr, "matrMagicCuda: width, height and nreps must be positive\n");
            return cudaErrorInvalidValue;
        }

        bytes = (size_t)width * height * sizeof(float);

        // One TILE_DIM x BLOCK_ROWS block per TILE_DIM x TILE_DIM tile; the grid is
        // rounded up so partial tiles at the edges are covered.
        dim_block.x = TILE_DIM;
        dim_block.y = BLOCK_ROWS;
        dim_block.z = 1;

        dim_grid.x = (width + TILE_DIM - 1) / TILE_DIM;
        dim_grid.y = (height + TILE_DIM - 1) / TILE_DIM;
        dim_grid.z = 1;

        // Choose which GPU to run on, change this on a multi-GPU system.
        if (cudaFailed(cudaStatus = cudaSetDevice(0), "cudaSetDevice"))
            goto Error;

        if (cudaFailed(cudaStatus = cudaMalloc((void**)&dev_matrA, bytes), "cudaMalloc (input)"))
            goto Error;

        if (cudaFailed(cudaStatus = cudaMalloc((void**)&dev_matrB, bytes), "cudaMalloc (output)"))
            goto Error;

        // Upload the input and clear the output so untouched elements read as 0.
        if (cudaFailed(cudaStatus = cudaMemcpy(dev_matrA, matrA, bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)"))
            goto Error;

        if (cudaFailed(cudaStatus = cudaMemset(dev_matrB, 0, bytes), "cudaMemset"))
            goto Error;

        if (cudaFailed(cudaStatus = cudaEventCreate(&start), "cudaEventCreate"))
            goto Error;

        if (cudaFailed(cudaStatus = cudaEventCreate(&stop), "cudaEventCreate"))
            goto Error;

        if (cudaFailed(cudaStatus = cudaEventRecord(start), "cudaEventRecord"))
            goto Error;

        if (operation == 1)
            naiveTransKernel<<<dim_grid, dim_block>>>(dev_matrB, dev_matrA, width, height, nreps);
        else
            notSoNaivaTransKernel<<<dim_grid, dim_block>>>(dev_matrB, dev_matrA, width, height, nreps);

        // Launch-configuration errors are only visible through cudaGetLastError.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        if (cudaFailed(cudaStatus = cudaEventRecord(stop), "cudaEventRecord"))
            goto Error;

        // Waiting on the stop event also waits for the kernel, so errors raised
        // while the kernel was running are reported here.
        if (cudaFailed(cudaStatus = cudaEventSynchronize(stop), "Kernel execution"))
            goto Error;

        if (cudaFailed(cudaStatus = cudaEventElapsedTime(&elapsed, start, stop), "cudaEventElapsedTime"))
            goto Error;

        // Average per repetition; ms -> s and B -> GB are folded into the 1e6
        // factor, times two for one read and one write per element.
        elapsed /= nreps;
        gpuBandwidth = (2.0 * bytes) / (elapsed * 1000000.0);
        printf("Avg. GPU %s Time: %f ms, bandwidth: %f GB/s\n", label, elapsed, gpuBandwidth);

        // Download the result.
        if (cudaFailed(cudaStatus = cudaMemcpy(matrB, dev_matrB, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)"))
            goto Error;

    Error:
        if (start)
            cudaEventDestroy(start);
        if (stop)
            cudaEventDestroy(stop);
        cudaFree(dev_matrB);
        cudaFree(dev_matrA);

        return cudaStatus;
    }
    
    // Reference transpose on the host: reads matrA as `height` rows of `width`
    // floats and writes element (row, col) to matrB[col * height + row].
    // The whole transpose is repeated nreps times so the caller can average timings.
    void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
    {
        for (int rep = 0; rep < nreps; ++rep)
        {
            for (int row = 0; row < height; ++row)
            {
                for (int col = 0; col < width; ++col)
                {
                    const int src = row * width + col;
                    const int dst = col * height + row;
                    matrB[dst] = matrA[src];
                }
            }
        }
    }
    
    // Direct global-memory transpose (no shared memory).
    //
    // matrA holds `height` rows of `width` floats; element (row, col) is written
    // to matrB[col * height + row]. Each thread handles TILE_DIM / BLOCK_ROWS
    // rows of one column, stepping by BLOCK_ROWS, and repeats the pass nreps
    // times. Row and column are bounds-checked separately, so threads outside
    // the matrix do nothing.
    __global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
    {
        const int col = blockIdx.x * TILE_DIM + threadIdx.x;
        const int firstRow = blockIdx.y * TILE_DIM + threadIdx.y;
        const bool colInside = col < width;

        for (int rep = 0; rep < nreps; ++rep)
        {
    #pragma unroll
            for (int step = 0; step < TILE_DIM; step += BLOCK_ROWS)
            {
                const int row = firstRow + step;
                if (colInside && row < height)
                    matrB[col * height + row] = matrA[row * width + col];
            }
        }
    }
    
    // Transpose staged through a shared-memory tile.
    //
    // matrA holds `height` rows of `width` floats; matrB receives `width` rows of
    // `height` floats. Each block loads one TILE_DIM x TILE_DIM tile of matrA
    // into shared memory, then writes it back with the block coordinates and the
    // thread coordinates both swapped. Each thread handles TILE_DIM / BLOCK_ROWS
    // elements per phase, and the whole pass is repeated nreps times.
    // Loads and stores are bounds-checked per dimension; the barriers sit outside
    // the guards so every thread of the block reaches them.
    __global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
    {
        __shared__ float tile[TILE_DIM][TILE_DIM + 1];

        const int tx = threadIdx.x;
        const int ty = threadIdx.y;
        const int tileCol = blockIdx.x * TILE_DIM; // first matrA column of this tile
        const int tileRow = blockIdx.y * TILE_DIM; // first matrA row of this tile

        // Read coordinates in matrA.
        const int inCol = tileCol + tx;
        const int inRow = tileRow + ty;
        // Write coordinates in matrB: the tile origin is swapped.
        const int outCol = tileRow + tx;
        const int outRow = tileCol + ty;

        for (int rep = 0; rep < nreps; ++rep)
        {
    #pragma unroll
            for (int step = 0; step < TILE_DIM; step += BLOCK_ROWS)
            {
                if (inCol < width && inRow + step < height)
                    tile[ty + step][tx] = matrA[(inRow + step) * width + inCol];
            }
            // The tile must be fully loaded before it is read with swapped indices.
            __syncthreads();

    #pragma unroll
            for (int step = 0; step < TILE_DIM; step += BLOCK_ROWS)
            {
                if (outCol < height && outRow + step < width)
                    matrB[(outRow + step) * height + outCol] = tile[tx][ty + step];
            }
            // Keep the next repetition's loads from overwriting a tile that other
            // threads are still reading.
            __syncthreads();
        }
    }
    $ nvcc -std=c++11 -arch=sm_61 -o t33 t33.cu
    t33.cu(25): warning: variable "matrAC" was set but never used
    
    t33.cu(25): warning: variable "matrAC" was set but never used
    
    $ cuda-memcheck ./t33
    ========= CUDA-MEMCHECK
    Avg. CPU Transpose Time: 0.143087 ms, Bandwidth: 2.795509 GB/s
    
    Avg. GPU Naive Trans Time: 0.028587 ms, bandwidth: 13.992195 GB/s
            Correct: 50000, Wrong: 0
    
    Avg. GPU Trans with SM Time: 0.040328 ms, bandwidth: 9.918678 GB/s
            Correct: 50000, Wrong: 0
    
    ========= ERROR SUMMARY: 0 errors
    $ ./t33
    Avg. CPU Transpose Time: 0.140469 ms, Bandwidth: 2.847594 GB/s
    
    Avg. GPU Naive Trans Time: 0.003828 ms, bandwidth: 104.505440 GB/s
            Correct: 50000, Wrong: 0
    
    Avg. GPU Trans with SM Time: 0.000715 ms, bandwidth: 559.206604 GB/s
            Correct: 50000, Wrong: 0
    
    $
    

    注意:代码尝试测量带宽。但是,您应该知道此处测量到的带宽会受到缓存带宽的影响。这里的矩阵(500x100,输入和输出各200K字节)足够小,可以轻松放入大多数GPU的L2缓存。再加上您多次(nreps次)运行同一个转置,这意味着大部分工作是直接在L2缓存中完成的。因此,在上面“优化”的情况下,我们看到报告的带宽大大超过了GPU的可用显存带宽(这里用的恰好是Pascal Titan X,其可用的主存带宽约为340GB/s)。这是因为该测量包含了L2缓存带来的部分收益,而L2缓存的带宽至少是主存带宽的两倍。您可以通过使用明显更大的矩阵和/或将nreps减小到1来消除这种影响。