使用CUDA事件与Windows计时器测量的CUDA传输时间对比

时间:2014-06-16 22:39:38

标签: cuda memcpy

我使用固定(pinned)内存传输48 KB的数据块。用CUDA事件测量时,上传速度约为5 GB/s;但改用Windows计时器测量时,速度只有大约一半。这只是不可避免的驱动程序开销,还是有办法缓解这种情况?我已将该过程封装在下面的测试程序中。

/*
 * Times a single host-to-device copy of `size` unsigned integers and prints
 * the throughput measured two ways:
 *   - with CUDA events recorded immediately before and after the copy, and
 *   - with the host-side StopWatchWin timer wrapped around the same region
 *     (including the wait for the stop event).
 *
 * size: number of `unsigned` elements to transfer; the byte count is
 *       size * sizeof(unsigned).
 *
 * Every CUDA call is wrapped in checkCudaErrors. All resources created here
 * (events, host buffer, device buffer) are released before returning.
 */
void transferUp(size_t size)
{
    // `size` is an element count; compute the byte count once and reuse it.
    const size_t numBytes = size * sizeof(unsigned);

    StopWatchWin timer;
    timer.start();

    cudaEvent_t sendUpStartEvent, sendUpStopEvent;
    checkCudaErrors(cudaEventCreate(&sendUpStartEvent));
    checkCudaErrors(cudaEventCreate(&sendUpStopEvent));

    // The host buffer comes from cudaHostAlloc only. The original code also
    // malloc'ed a buffer first and then overwrote the pointer, leaking it.
    unsigned *cpu_sending = NULL;
    checkCudaErrors(cudaHostAlloc(&cpu_sending, numBytes, cudaHostAllocPortable));

    unsigned *gpu_receiving = NULL;
    checkCudaErrors(cudaMalloc(&gpu_receiving, numBytes));

    // Host-side timestamps bracket the event-timed region, so the difference
    // between the two measurements is host/driver overhead.
    // NOTE(review): getTime() is assumed to return milliseconds (it is
    // printed with an "ms" suffix below) -- confirm against StopWatchWin.
    const float tUpCopyStart = timer.getTime();
    checkCudaErrors(cudaEventRecord(sendUpStartEvent));

    checkCudaErrors(cudaMemcpyAsync(gpu_receiving, cpu_sending, numBytes, cudaMemcpyHostToDevice));

    checkCudaErrors(cudaEventRecord(sendUpStopEvent));
    // Block the host until the stop event has completed, so the host timer
    // covers the whole copy.
    checkCudaErrors(cudaEventSynchronize(sendUpStopEvent));
    const float tUpCopyStop = timer.getTime();

    const double sendTimeWindows = tUpCopyStop - tUpCopyStart;

    float sendTimeCuda = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&sendTimeCuda, sendUpStartEvent, sendUpStopEvent));

    // bytes / (ms * 1e6) == GB/s. Done entirely in floating point so the byte
    // count is not truncated by an integer division.
    const double GbSec_cuda = static_cast<double>(numBytes) / (static_cast<double>(sendTimeCuda) * 1.0e6);
    const double GbSec_win  = static_cast<double>(numBytes) / (sendTimeWindows * 1.0e6);

    // size_t must not be printed with %d; cast to a type with a matching,
    // portable conversion specifier.
    printf("size=%06llu bytes eventTime=%.03fms windowsTime=%0.3fms cudaSpeed=%.01f gb/s winSpeed=%.01f gb/s\n",
           static_cast<unsigned long long>(numBytes), sendTimeCuda, sendTimeWindows, GbSec_cuda, GbSec_win);

    checkCudaErrors(cudaEventDestroy(sendUpStartEvent));
    checkCudaErrors(cudaEventDestroy(sendUpStopEvent));

    checkCudaErrors(cudaFreeHost(cpu_sending));
    checkCudaErrors(cudaFree(gpu_receiving));
}

1 个答案:

答案 0 :(得分:1)

对于如此小的操作,计时本身的开销占了主导地位。

对于小型的主机->设备拷贝(例如64 KB或更小),CUDA驱动程序会将数据内联到命令缓冲区中,因此即使是名义上同步的memcpy调用,实际上也是异步完成的。但是,代码中的cudaEventSynchronize()调用会强制CPU等待,而不是继续执行。