Question

有谁能帮我理解memCopy2dA和memCopy2dB内核之间的性能差异？

它们都应当把尺寸为xLen × yLen的2D数据从一个位置复制到另一个位置，但采用了不同的策略：

当使用memCopy2dA时，块/线程覆盖整个2D空间，因为该内核的每个线程只负责复制一个数据点
当使用memCopy2dB时，只为整个X行创建块/线程，然后每个线程在Y方向上循环以复制所有数据。

根据profiler（nvvp），两种情况下GPU的内存访问模式效率均为100％，并且X维度足够大，足以让“B”内核使设备（Titan X，24个SM）饱和。不幸的是，“B”内核更慢，在我的机器上结果如下：

GB/s: 270.715
GB/s: 224.405

附加问题：是否有可能接近理论内存带宽限制，即336.48 GB / s（3505MHz * 384位* 2/8）？至少我的测试显示最大值总是在271-272 GB / s左右。

测试代码：

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <chrono>

// Copies a row-major array with rows of length xLen and yLen rows from `in`
// to `out`, one element per thread.
//
// Launch layout: 2D grid/block; the x dimension indexes within a row and the
// y dimension selects the row. Threads whose (x, y) position falls outside
// xLen/yLen do nothing, so the grid may overshoot the data.
template<typename T>
__global__ void memCopy2dA(T *in, T *out, size_t xLen, size_t yLen) {
    // Widen before multiplying: blockIdx * blockDim is otherwise evaluated in
    // 32-bit arithmetic and can wrap, while the extents are size_t.
    const size_t xi = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t yi = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
    if (xi < xLen && yi < yLen) {
        const size_t idx = yi * xLen + xi;
        out[idx] = in[idx];
    }
}

// Copies a row-major array with rows of length xLen and yLen rows from `in`
// to `out`, one column per thread.
//
// Launch layout: 1D grid/block along x. Each in-range thread starts at
// element xi of the first row and steps forward by xLen per iteration, so it
// visits the same x position in every row. Threads with xi >= xLen do nothing.
template<typename T>
__global__ void memCopy2dB(T *in, T *out, size_t xLen, size_t yLen) {
    // Widen before multiplying: blockIdx * blockDim is otherwise evaluated in
    // 32-bit arithmetic and can wrap, while the extents are size_t.
    const size_t xi = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (xi < xLen) {
        size_t idx = xi;
        // size_t counter so the comparison with yLen is not signed/unsigned.
        for (size_t y = 0; y < yLen; ++y) {
            out[idx] = in[idx];
            idx += xLen;
        }
    }
}

// Blocks the host until the device has finished all outstanding work, then
// prints the most recent CUDA runtime error, if there is one. Never aborts.
static void waitForCuda() {
    cudaDeviceSynchronize();
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;
    }
    printf("Error: %s\n", cudaGetErrorString(status));
}

// Benchmarks memCopy2dA and memCopy2dB on the same pair of device buffers and
// prints one "GB/s: <value>" line per kernel.
//
// Timing is host-side wall-clock time around numOfRepetitions launches, each
// followed by waitForCuda(), so launch and synchronization overhead is part of
// the measurement. Returns 1 if a device buffer cannot be allocated.
int main() {
    typedef float T;

    size_t xLen = 24 * 32 * 64; //49152
    size_t yLen = 1024;
    size_t dataSize = xLen * yLen * sizeof(T);

    // Stop early on allocation failure; otherwise the kernels would be
    // launched on invalid pointers and the printed numbers would be bogus.
    T *dInput = nullptr;
    cudaError_t allocErr = cudaMalloc(&dInput, dataSize);
    if (allocErr != cudaSuccess) {
        std::cerr << "cudaMalloc(dInput) failed: " << cudaGetErrorString(allocErr) << std::endl;
        return 1;
    }
    T *dOutput = nullptr;
    allocErr = cudaMalloc(&dOutput, dataSize);
    if (allocErr != cudaSuccess) {
        std::cerr << "cudaMalloc(dOutput) failed: " << cudaGetErrorString(allocErr) << std::endl;
        cudaFree(dInput);
        return 1;
    }

    const int numOfRepetitions = 100;
    const double gigabyte = 1000 * 1000 * 1000;
    // Every repetition reads dataSize bytes and writes dataSize bytes.
    const size_t bytesMoved = 2 * dataSize * numOfRepetitions;
    {
        // Kernel A: 2D launch, one thread per element.
        dim3 threadsPerBlock(64, 1);
        dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (yLen + threadsPerBlock.y - 1) / threadsPerBlock.y);

        auto startTime = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < numOfRepetitions; ++i) {
            memCopy2dA <<< numBlocks, threadsPerBlock >>> (dInput, dOutput, xLen, yLen);
            waitForCuda();
        }
        auto stopTime = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = stopTime - startTime;
        std::cout << "GB/s: " << bytesMoved / elapsed.count() / gigabyte << std::endl;
    }
    {
        // Kernel B: 1D launch along x, each thread loops over all rows.
        dim3 threadsPerBlock(64);
        dim3 numBlocks((xLen + threadsPerBlock.x - 1) / threadsPerBlock.x);

        auto startTime = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < numOfRepetitions; ++i) {
            memCopy2dB <<< numBlocks, threadsPerBlock >>> (dInput, dOutput, xLen, yLen);
            waitForCuda();
        }
        auto stopTime = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> elapsed = stopTime - startTime;
        std::cout << "GB/s: " << (bytesMoved / elapsed.count()) / gigabyte << std::endl;
    }

    cudaFree(dInput);
    cudaFree(dOutput);

    return 0;
}

编译：

nvcc -std=c++11 memTest.cu -o memTest

Answer 1

我找到了一个加速memCopy2dB内核的办法。以下测试是在1080Ti上执行的（我已经无法再使用TITAN X）。问题部分的代码会产生以下结果：

GB/s: 365.423
GB/s: 296.678

或多或少与先前在Titan X上观察到的百分比差异相同。现在修改后的memCopy2dB内核看起来像：

// Variant of memCopy2dB with a block-wide barrier before every row copy.
//
// Launch layout: 1D grid/block along x; each in-range thread copies element
// xi of every row, stepping by xLen per iteration.
//
// The barrier must be reached by every thread of the block, so the range
// check guards only the copy, not the loop: all threads run yLen iterations
// (yLen is the same for every thread) and all of them hit __syncthreads().
template<typename T>
__global__ void memCopy2dB(T *in, T *out, size_t xLen, size_t yLen) {
    // Widen before multiplying so the index cannot wrap in 32-bit arithmetic.
    const size_t xi = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const bool inRange = xi < xLen;
    size_t idx = xi;
    for (size_t y = 0; y < yLen; ++y) {
        __syncthreads();  // <------ this line added
        if (inRange) {
            out[idx] = in[idx];
        }
        idx += xLen;
    }
}

关于warp级别的合并内存操作有多重要（即warp中的所有线程应当访问同一个对齐的内存段），已经有很多资料。但是，在一个块内同步各个warp似乎使得warp之间的合并访问也成为可能，这大概更好地利用了不同GPU上的内存总线宽度。<-这只是我对这个问题的“解释”，因为我没有找到关于此问题的文献。

总之，加上这一行本不需要的同步之后（就代码逻辑而言，我并不需要同步warp），这两个内核的结果如下：

GB/s: 365.255
GB/s: 352.026

因此，即使同步会拖慢代码的执行，我们仍然得到了好得多的结果。我在自己的一些以memCopy2dB方式访问数据的代码上尝试了这种技术，获得了很好的加速效果。

两个CUDA内核的内存复制 - 为什么速度不同？

1 个答案: