CUDA 8.0 - cudaMemcpy() - 线性或恒定时间操作?

时间:2017-08-08 11:45:22

标签: cuda

CUDA 8.0 中的 cudaMemcpy() 是一次性复制整个内存块,还是逐字节地复制?

我想限制复制时间,但我在文档中找不到任何指定cudaMemcpy()是线性还是恒定时间操作的内容。

1 个答案:

答案 0 :(得分:1)

同步内存传输不是固定时间,而是具有固定延迟组件和与传输大小成比例的组件。在小尺寸下,延迟占主导地位,在大尺寸下,限制传输速度受到内存或总线带宽的限制。

考虑以下琐碎的基准:

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <string>

// Grid-stride fill kernel: every element of x[0..n) is written with the
// threadIdx.x of the thread that touched it.  The stride loop makes the
// kernel correct for any grid size, including grids smaller than n.
__global__ void memsetkernel(int *x, int n)
{
    const int step = blockDim.x * gridDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step) {
        x[i] = threadIdx.x;
    }
}

// Aborts with a readable message if a CUDA API call returned an error.
// Kernel launches are checked indirectly via the event/sync calls that follow.
static void cudaCheck(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Benchmark driver.  Times three operations on the same n-int buffer —
// (1) an on-device fill kernel, (2) device-to-host copies, (3) host-to-device
// copies — and prints the average bandwidth of each over nreps repetitions.
// Optional argv[1] overrides the element count n (default 100).
int main(int argc, char* argv[])
{
    int n = 100;           // number of ints moved per operation
    const int nreps = 10;  // repetitions averaged per measurement

    if (argc > 1) {
        n = std::stoi(std::string(argv[1]));
    }

    const size_t sz = sizeof(int) * size_t(n);

    // Host buffer.  This is ordinary pageable memory; pinned memory
    // (cudaMallocHost) would show higher transfer bandwidth.
    int* host = new int[n];

    // Device buffer of n ints.
    int* device = nullptr;
    cudaCheck(cudaMalloc(&device, sz), "cudaMalloc");

    cudaEvent_t start, stop;
    cudaCheck(cudaEventCreate(&start), "cudaEventCreate(start)");
    cudaCheck(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Reads the elapsed time between the recorded start/stop events and
    // prints the average bandwidth.  kilobytes per millisecond == MB/s.
    auto report = [&](const char* label) {
        float milliseconds = 0.0f;
        cudaCheck(cudaEventElapsedTime(&milliseconds, start, stop),
                  "cudaEventElapsedTime");
        milliseconds /= float(nreps);  // average of nreps
        const float kilobytes = float(sz) / 1e3f;
        const float bandwidth = kilobytes / milliseconds;
        std::cout << label << bandwidth << " MB/s" << std::endl;
    };

    {
        const int nthreads = 1024;
        // At most 26 resident blocks (13 SMs x 2), but never fewer than 1;
        // the grid-stride loop in the kernel covers any remainder.
        const int nblocks = std::max(1, std::min(13 * 2, n / nthreads));
        memsetkernel<<<nblocks, nthreads>>>(device, n);  // warm-up launch
        cudaCheck(cudaDeviceSynchronize(), "warm-up kernel");
        cudaCheck(cudaEventRecord(start), "cudaEventRecord(start)");
        for (int i = 0; i < nreps; i++) {
            memsetkernel<<<nblocks, nthreads>>>(device, n);
        }
        cudaCheck(cudaEventRecord(stop), "cudaEventRecord(stop)");
        cudaCheck(cudaEventSynchronize(stop), "cudaEventSynchronize");
        report("kernel assignment: ");
    }

    {
        // Warm-up copy so the timed loop measures steady-state transfers.
        cudaCheck(cudaMemcpy(host, device, sz, cudaMemcpyDeviceToHost),
                  "cudaMemcpy DtoH warm-up");
        cudaCheck(cudaEventRecord(start), "cudaEventRecord(start)");
        for (int i = 0; i < nreps; i++) {
            cudaCheck(cudaMemcpy(host, device, sz, cudaMemcpyDeviceToHost),
                      "cudaMemcpy DtoH");
        }
        cudaCheck(cudaEventRecord(stop), "cudaEventRecord(stop)");
        cudaCheck(cudaEventSynchronize(stop), "cudaEventSynchronize");
        report("DTOH: ");
    }

    {
        cudaCheck(cudaMemcpy(device, host, sz, cudaMemcpyHostToDevice),
                  "cudaMemcpy HtoD warm-up");
        cudaCheck(cudaEventRecord(start), "cudaEventRecord(start)");
        for (int i = 0; i < nreps; i++) {
            cudaCheck(cudaMemcpy(device, host, sz, cudaMemcpyHostToDevice),
                      "cudaMemcpy HtoD");
        }
        cudaCheck(cudaEventRecord(stop), "cudaEventRecord(stop)");
        cudaCheck(cudaEventSynchronize(stop), "cudaEventSynchronize");
        report("HTOD: ");
    }

    // Release everything we allocated (the original leaked host, device,
    // and both events), then reset the device.
    cudaCheck(cudaEventDestroy(start), "cudaEventDestroy(start)");
    cudaCheck(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    cudaCheck(cudaFree(device), "cudaFree");
    delete[] host;

    cudaDeviceReset();

    return 0;
}

以不同的数据大小运行此操作会显示以下行为:

(图表:不同数据大小下测得的三种操作的带宽曲线)

设备到主机和主机到设备的传输都渐近接近所讨论机器的 PCI-e 总线带宽的 60% 左右(约 6.5 GB/s;使用固定(pinned)主机内存可以达到更高的值),内核则达到 GPU 主存带宽的 70% 左右(约 150 GB/s,理论峰值带宽约为 224 GB/s)。

NVIDIA 提供了一个用于测量传输带宽的示例程序(CUDA 示例中的 bandwidthTest),您可以参考其文档并用它来自行探索硬件的性能。