我正在编写一些测试代码以熟悉cudaMemcpyAsync
的并发属性。
当我尝试在单个上下文中执行并发cudaMemcpyAsync
时,复制操作正在排队并以吞吐量12.4 GB/s
逐一执行。时间线分析结果如下:
但是,当我尝试在不同上下文中进行并发cudaMemcpyAsync
时(通过将它们分为4个进程),似乎第一个和最后一个正在同时运行(时间线截图见下):
前2个连续的cudaMemcpyAsync
以吞吐量12.4 GB/s
运行,而后2个并发的5.3 GB/s
以吞吐量运行。
如何在单个上下文中并发cudaMemcpyAsync
?
我正在CUDA9.0
上使用TITAN Xp
,它有2个复制引擎。
编辑:
方案1的代码
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
// Wraps a CUDA runtime call: on failure, prints a readable message to
// stderr and terminates the process. Returns the original status so it
// can be used inline around any runtime call.
inline
cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        // assert() is compiled out under NDEBUG, which would silently
        // ignore CUDA errors in release builds; exit unconditionally so
        // failures are never swallowed.
        exit(EXIT_FAILURE);
    }
    return result;
}
// Number of CUDA streams and worker threads launched concurrently.
const int nStreams = 8;
// Element count per transfer; 100M ints ≈ 400 MB per copy.
const int N = 100000000;
const int bytes = N * sizeof(int);
// Pinned host source buffer, shared read-only by all transfers.
int* arr_H;
// One device destination buffer per stream.
int* arr_D[nStreams];
cudaStream_t stream[nStreams];
// Per-thread stream index arguments and the thread handles themselves.
int args[nStreams];
pthread_t threads[nStreams];
/* Thread entry point: issues a single asynchronous host-to-device copy
 * on the stream whose index is pointed to by arg (set up by main()). */
void* worker(void *arg)
{
    const int idx = *(int *)arg;
    checkCuda(cudaMemcpyAsync(arr_D[idx], arr_H, bytes,
                              cudaMemcpyHostToDevice, stream[idx]));
    return NULL;
}
// Scenario 1: one process / one context, nStreams pthreads each issuing
// one async H2D copy on its own stream, to observe copy-engine overlap.
int main()
{
    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaStreamCreate(&stream[i]));

    // Pinned host memory is required for cudaMemcpyAsync to be truly
    // asynchronous with respect to the host.
    checkCuda(cudaMallocHost((void**)&arr_H, bytes));
    for (int i = 0; i < N; i++)
        arr_H[i] = random();

    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaMalloc((void**)&arr_D[i], bytes));

    for (int i = 0; i < nStreams; i++) {
        args[i] = i;
        // pthread_create returns an errno-style code, not errno itself.
        if (pthread_create(&threads[i], NULL, worker, &args[i]) != 0) {
            fprintf(stderr, "pthread_create failed for thread %d\n", i);
            return 1;
        }
    }
    for (int i = 0; i < nStreams; i++)
        pthread_join(threads[i], NULL);

    // pthread_join only guarantees the async copies were *issued*; wait
    // for the GPU to finish them before freeing the pinned source buffer
    // they are still reading from.
    checkCuda(cudaDeviceSynchronize());

    cudaFreeHost(arr_H);
    for (int i = 0; i < nStreams; i++) {
        checkCuda(cudaStreamDestroy(stream[i]));
        cudaFree(arr_D[i]);
    }
    return 0;
}
方案2的代码
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
// Wraps a CUDA runtime call: on failure, prints a readable message to
// stderr and terminates the process. Returns the original status so it
// can be used inline around any runtime call.
inline
cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        // assert() is compiled out under NDEBUG, which would silently
        // ignore CUDA errors in release builds; exit unconditionally so
        // failures are never swallowed.
        exit(EXIT_FAILURE);
    }
    return result;
}
// Scenario 2: a single stream per process; several copies of this binary
// are launched concurrently (by the Python driver) so each transfer runs
// in its own CUDA context.
int main()
{
    const int nStreams = 1;
    const int N = 100000000;
    // size_t keeps the byte count overflow-safe if N is ever enlarged.
    const size_t bytes = N * sizeof(int);
    int* arr_H;
    int* arr_D[nStreams];
    cudaStream_t stream[nStreams];

    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaStreamCreate(&stream[i]));

    // Pinned host memory is required for a truly asynchronous copy.
    checkCuda(cudaMallocHost((void**)&arr_H, bytes));
    for (int i = 0; i < N; i++)
        arr_H[i] = random();

    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaMalloc((void**)&arr_D[i], bytes));

    for (int i = 0; i < nStreams; i++)
        checkCuda(cudaMemcpyAsync(arr_D[i], arr_H, bytes,
                                  cudaMemcpyHostToDevice, stream[i]));

    // The copies are asynchronous: wait for them to complete before
    // freeing the pinned source buffer they read from.
    checkCuda(cudaDeviceSynchronize());

    cudaFreeHost(arr_H);
    for (int i = 0; i < nStreams; i++) {
        checkCuda(cudaStreamDestroy(stream[i]));
        cudaFree(arr_D[i]);
    }
    return 0;
}
代码2基本上是从代码1复制而来的。我使用python脚本同时运行多个进程:
#!/usr/bin/env python3
"""Launch N copies of ./a.out concurrently and wait for all of them."""
import subprocess

N = 4

# Exec the binary directly (no shell=True): each Popen then tracks the
# actual ./a.out process rather than an intermediate shell wrapper, so
# wait() reflects the real process and its exit status.
processes = [subprocess.Popen(['./a.out']) for _ in range(N)]
for proc in processes:
    proc.wait()