可以并发cudaMemcpyAsync吗?

时间:2019-04-02 06:06:00

标签: cuda gpu nvidia

我正在编写一些测试代码以熟悉cudaMemcpyAsync的并发属性。

当我尝试在单个上下文中执行并发cudaMemcpyAsync时,复制操作会被排队,并以12.4 GB/s的吞吐量逐一执行。时间线截图见此处(原文链接,抓取时已丢失)。

但是,当我尝试在不同上下文中进行并发cudaMemcpyAsync时(通过将它们分为4个进程),似乎第一个和最后一个正在同时运行:时间线截图见此处(原文链接,抓取时已丢失)。

前2个cudaMemcpyAsync以12.4 GB/s的吞吐量顺序运行,而后2个则以5.3 GB/s的吞吐量并发运行。

如何才能在单个上下文中并发执行cudaMemcpyAsync?

我在CUDA 9.0上使用TITAN Xp,它有2个复制引擎。

编辑

方案1的代码

#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>

// Thin wrapper for CUDA runtime calls: on any status other than cudaSuccess,
// logs the human-readable error string to stderr and aborts via assert.
// Returns the status unchanged so it can wrap calls inline.
inline
cudaError_t checkCuda(cudaError_t result)
{
  if (result == cudaSuccess)
    return result;
  fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
  assert(result == cudaSuccess);
  return result;
}

// --- Shared experiment state (scenario 1: one context, many host threads) ---
const int nStreams = 8;             // number of streams == number of worker threads
const int N = 100000000;            // ints per transfer
const int bytes = N * sizeof(int);  // 400,000,000 bytes per copy (fits in int)
int* arr_H;                         // pinned host source buffer (see cudaMallocHost in main)
int* arr_D[nStreams];               // one device destination buffer per stream
cudaStream_t stream[nStreams];      // one stream per worker thread
int args[nStreams];                 // per-thread stream index handed to worker()
pthread_t threads[nStreams];

// Pthread entry point: reads its stream index from arg and enqueues one
// asynchronous host-to-device copy of the shared pinned buffer on that stream.
// The copy is only *issued* here; completion is not awaited by this thread.
void* worker(void *arg)
{
  const int idx = *(int *)arg;
  checkCuda(cudaMemcpyAsync(arr_D[idx], arr_H, bytes,
                            cudaMemcpyHostToDevice, stream[idx]));
  return NULL;
}

// Scenario 1 driver: one CUDA context, nStreams host threads each issuing one
// async H2D copy on its own stream, to observe whether the copies overlap.
int main()
{
  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaStreamCreate(&stream[i]));

  // Pinned host memory is required for cudaMemcpyAsync to be truly async.
  checkCuda(cudaMallocHost((void**)&arr_H, bytes));
  for (int i = 0; i < N; i++)
    arr_H[i] = random();

  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaMalloc((void**)&arr_D[i], bytes));

  // Each thread issues one async copy on its own stream.
  for (int i = 0; i < nStreams; i++) {
    args[i] = i;
    pthread_create(&threads[i], NULL, worker, &args[i]);
  }

  for (int i = 0; i < nStreams; i++)
    pthread_join(threads[i], NULL);

  // BUG FIX: joining the threads only guarantees the copies were *issued*,
  // not that they finished. Wait for the device before releasing the pinned
  // source buffer, otherwise in-flight copies may read freed memory.
  checkCuda(cudaDeviceSynchronize());

  cudaFreeHost(arr_H);
  for (int i = 0; i < nStreams; i++) {
    checkCuda(cudaStreamDestroy(stream[i]));
    cudaFree(arr_D[i]);
  }

  return 0;
}  // BUG FIX: closing brace was missing from the original listing
方案2的代码

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>

// Thin wrapper for CUDA runtime calls: on any status other than cudaSuccess,
// logs the human-readable error string to stderr and aborts via assert.
// Returns the status unchanged so it can wrap calls inline.
inline
cudaError_t checkCuda(cudaError_t result)
{
  if (result == cudaSuccess)
    return result;
  fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
  assert(result == cudaSuccess);
  return result;
}

// Scenario 2 driver: a single-stream process, launched several times in
// parallel (one CUDA context per process) by the accompanying Python script.
int main()
{
  const int nStreams = 1;             // one stream per process in this scenario
  const int N = 100000000;            // ints per transfer (400,000,000 bytes)
  const int bytes = N * sizeof(int);
  int* arr_H;                         // pinned host source buffer
  int* arr_D[nStreams];               // device destination buffer(s)
  cudaStream_t stream[nStreams];

  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaStreamCreate(&stream[i]));

  // Pinned host memory is required for cudaMemcpyAsync to be truly async.
  checkCuda(cudaMallocHost((void**)&arr_H, bytes));
  for (int i = 0; i < N; i++)
    arr_H[i] = random();

  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaMalloc((void**)&arr_D[i], bytes));

  for (int i = 0; i < nStreams; i++)
    checkCuda(cudaMemcpyAsync(arr_D[i], arr_H, bytes,
                              cudaMemcpyHostToDevice, stream[i]));

  // BUG FIX: the copies above are asynchronous; freeing the pinned source
  // buffer without waiting risks the copy reading freed memory. Synchronize
  // before any cleanup.
  checkCuda(cudaDeviceSynchronize());

  cudaFreeHost(arr_H);
  for (int i = 0; i < nStreams; i++) {
    checkCuda(cudaStreamDestroy(stream[i]));
    cudaFree(arr_D[i]);
  }

  return 0;
}

代码2基本上是从代码1复制而来的。我使用python脚本同时运行多个进程:

#!/usr/bin/env python3
"""Launch N copies of ./a.out concurrently and wait for all of them."""
import subprocess

N = 4

# Pass an argv list instead of shell=True: no intermediate shell process is
# spawned (an extra process would be noise in this concurrency experiment),
# and there is no shell-injection surface.
processes = [subprocess.Popen(['./a.out']) for _ in range(N)]

for process in processes:
    process.wait()

0 个答案:

没有答案