CUDA 8.0: does cudaMemcpy() copy the whole memory block at once, or byte by byte?
I want to bound the copy time, but I cannot find anything in the documentation that specifies whether cudaMemcpy() is a linear-time or constant-time operation.
Answer (score: 1)
A synchronous memory transfer is not constant time: it has a fixed latency component plus a component proportional to the transfer size. At small sizes the latency dominates; at large sizes the achievable transfer rate is limited by memory or bus bandwidth.
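As a rough rule of thumb (my own illustration, not something stated in the documentation), the copy time can be modelled as a fixed overhead plus a term proportional to the transfer size; the constants below are placeholder assumptions, not measured values:

#include <cstddef>

// Toy model of a synchronous cudaMemcpy(): fixed latency + size / bandwidth.
// Both constants are illustrative assumptions; measure them on your own system.
double estimated_copy_ms(std::size_t bytes)
{
    const double latency_ms      = 0.01;   // ~10 us fixed per-call overhead (assumed)
    const double bandwidth_bytes = 6.0e9;  // ~6 GB/s effective PCIe bandwidth (assumed)
    return latency_ms + 1.0e3 * bytes / bandwidth_bytes;
}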
Consider the following trivial benchmark:
#include <iostream>
#include <string>
#include <algorithm>

// Simple kernel that writes to every element of x using a grid-stride loop.
__global__ void memsetkernel(int *x, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    for(; tid < n; tid += stride) {
        x[tid] = threadIdx.x;
    }
}

int main(int argc, char* argv[])
{
    // size (number of ints), optionally taken from the command line
    int n = 100;
    int nreps = 10;
    if (argc > 1) {
        n = std::stoi(std::string(argv[1]));
    }
    size_t sz = sizeof(int) * size_t(n);

    // host array (pageable memory)
    int* host = new int[n];

    // allocate n ints on the device
    int* device;
    cudaMalloc(&device, sz);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // 1. kernel "memset" to estimate device memory bandwidth
    {
        int nthreads = 1024;
        int nblocks = std::max(1, std::min(13*2, n / nthreads)); // cap at 2 blocks per SM on a 13-SM GPU

        memsetkernel<<<nblocks, nthreads>>>(device, n); // warm-up launch
        cudaDeviceSynchronize();

        cudaEventRecord(start);
        for(int i=0; i<nreps; i++) {
            memsetkernel<<<nblocks, nthreads>>>(device, n);
        }
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float milliseconds, kilobytes, bandwidth;
        cudaEventElapsedTime(&milliseconds, start, stop);
        milliseconds /= float(nreps); // average over nreps
        kilobytes = float(sz) / 1e3f;
        bandwidth = kilobytes / milliseconds; // kB/ms == MB/s
        std::cout << "kernel assignment: " << bandwidth << " MB/s" << std::endl;
    }

    // 2. device-to-host copy
    {
        cudaMemcpy(host, device, sz, cudaMemcpyDeviceToHost); // warm-up copy

        cudaEventRecord(start);
        for(int i=0; i<nreps; i++) {
            cudaMemcpy(host, device, sz, cudaMemcpyDeviceToHost);
        }
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float milliseconds, kilobytes, bandwidth;
        cudaEventElapsedTime(&milliseconds, start, stop);
        milliseconds /= float(nreps); // average over nreps
        kilobytes = float(sz) / 1e3f;
        bandwidth = kilobytes / milliseconds; // kB/ms == MB/s
        std::cout << "DTOH: " << bandwidth << " MB/s" << std::endl;
    }

    // 3. host-to-device copy
    {
        cudaMemcpy(device, host, sz, cudaMemcpyHostToDevice); // warm-up copy

        cudaEventRecord(start);
        for(int i=0; i<nreps; i++) {
            cudaMemcpy(device, host, sz, cudaMemcpyHostToDevice);
        }
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float milliseconds, kilobytes, bandwidth;
        cudaEventElapsedTime(&milliseconds, start, stop);
        milliseconds /= float(nreps); // average over nreps
        kilobytes = float(sz) / 1e3f;
        bandwidth = kilobytes / milliseconds; // kB/ms == MB/s
        std::cout << "HTOD: " << bandwidth << " MB/s" << std::endl;
    }

    // clean up and reset device
    delete [] host;
    cudaFree(device);
    cudaDeviceReset();

    return 0;
}
Running this over a range of transfer sizes shows the following behaviour:
Both the device-to-host and host-to-device copies asymptotically approach around 60% of the PCIe bus bandwidth of the machine in question (about 6.5 GB/s; higher values can be reached with pinned host memory), while the kernel reaches roughly 70% of the GPU's main memory bandwidth (about 150 GB/s, against a theoretical peak of roughly 224 GB/s).
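As a side note on the pinned-memory remark above, here is a minimal sketch (not part of the original benchmark) of how the host buffer could be allocated as page-locked memory with cudaMallocHost; only the allocation and deallocation change, and the timing loops stay the same:

    // Pinned (page-locked) host allocation instead of: int* host = new int[n];
    int* host = nullptr;
    cudaMallocHost(&host, sz);

    // ... run the same DTOH / HTOD timing loops as in the benchmark above ...

    // Release the pinned allocation instead of: delete [] host;
    cudaFreeHost(host);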
NVIDIA provides a sample for measuring transfer bandwidth, which you can read about here. You can use it to explore the performance of your hardware for yourself.