How to use cudaMallocManaged and cudaStreamAttachMemAsync to achieve compute/copy overlap?

Time: 2019-04-30 03:51:51

Tags: cuda

Edit:

I plan to use cudaMallocManaged and cudaStreamAttachMemAsync to achieve compute/copy overlap. One of the steps is to assign values to the memory allocated with cudaMallocManaged, and I used this answer as a reference. In that answer the managed memory is initialized with memset, whereas I need to copy existing data into the memory allocated by cudaMallocManaged. I therefore made some modifications to the example code and debugged it with GDB, which reported the error below (a simplified sketch of my change follows the error output, and the full MCVE after that):

(gdb) n
Fatal error: cudaStreamAttach fail (invalid argument at cmm.cu:58)
*** FAILED - ABORTING
[Thread 0x7fffeec0d700 (LWP 14751) exited]
[Thread 0x7fffef40e700 (LWP 14750) exited]
[Inferior 1 (process 14742) exited with code 01]
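
For clarity, the only intended difference from the linked answer is sketched here. This is not the failing code itself and the names (existing, managed, N) are just illustrative; the full MCVE follows. The point is that an existing host buffer is copied into cudaMallocManaged memory with a plain memcpy instead of initializing the managed memory with memset:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(){
  const size_t N = 1048576;

  // existing host data, standing in for the data my application already has
  int *existing = (int*)malloc(N*sizeof(int));
  memset(existing, 0, N*sizeof(int));

  // managed allocation with the default flags (no third argument)
  int *managed = NULL;
  cudaMallocManaged(&managed, N*sizeof(int));

  // plain host-side memcpy into the managed buffer; no kernel has been
  // launched yet, so the host may still touch the managed memory
  memcpy(managed, existing, N*sizeof(int));
  printf("managed[0] = %d\n", managed[0]);

  cudaFree(managed);
  free(existing);
  return 0;
}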

Here is the MCVE:

#include <stdio.h>
#include <string.h>   // memset, memcpy
#include <stdlib.h>   // malloc, exit
#include <time.h>
#define DSIZE 1048576
#define DWAIT 100000ULL
#define nTPB 256

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef int mytype;

__global__ void mykernel(mytype *data){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < DSIZE) data[idx] = 1;
  unsigned long long int tstart = clock64();
  while (clock64() < tstart + DWAIT);
}

int main(){

  mytype *data1, *data2, *data3;
  mytype *exist_data1, *exist_data2, *exist_data3;
  exist_data1 = (mytype*)malloc(DSIZE*sizeof(mytype));
  exist_data2 = (mytype*)malloc(DSIZE*sizeof(mytype));
  exist_data3 = (mytype*)malloc(DSIZE*sizeof(mytype));
  memset(exist_data1, 0, DSIZE*sizeof(mytype));
  memset(exist_data2, 0, DSIZE*sizeof(mytype));
  memset(exist_data3, 0, DSIZE*sizeof(mytype));

  cudaStream_t stream1, stream2, stream3;
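  // managed allocations; the optional third (flag) argument is commented out here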
  cudaMallocManaged(&data1, DSIZE*sizeof(mytype)/*, cudaMemAttachHost*/);
  cudaMallocManaged(&data2, DSIZE*sizeof(mytype)/*, cudaMemAttachHost*/);
  cudaMallocManaged(&data3, DSIZE*sizeof(mytype)/*, cudaMemAttachHost*/);
  cudaCheckErrors("cudaMallocManaged fail");

  //is it ok to do this kind of memcpy?
  memcpy(data1,exist_data1,DSIZE*sizeof(mytype));
  memcpy(data2,exist_data2,DSIZE*sizeof(mytype));
  memcpy(data3,exist_data3,DSIZE*sizeof(mytype));

  cudaStreamCreate(&stream1);
  cudaStreamCreate(&stream2);
  cudaStreamCreate(&stream3);
  cudaCheckErrors("cudaStreamCreate fail");
  cudaStreamAttachMemAsync(stream1, data1/*, cudaMemAttachGlobal*/);
  cudaStreamAttachMemAsync(stream2, data2/*, cudaMemAttachGlobal*/);
  cudaStreamAttachMemAsync(stream3, data3/*, cudaMemAttachGlobal*/);
  cudaDeviceSynchronize();
  cudaCheckErrors("cudaStreamAttach fail");  //this error checking gives error message

  mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream1>>>(data1);
  mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream2>>>(data2);
  mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream3>>>(data3);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  for (int i = 0; i < DSIZE; i++){
    if (data1[i] != 1) {printf("data1 mismatch at %d, should be: %d, was: %d\n", i, 1, data1[i]); return 1;}
    if (data2[i] != 1) {printf("data2 mismatch at %d, should be: %d, was: %d\n", i, 1, data2[i]); return 1;}
    if (data3[i] != 1) {printf("data3 mismatch at %d, should be: %d, was: %d\n", i, 1, data3[i]); return 1;}
    }
  printf("Success!\n");
  return 0;
}

It runs normally when I omit the third argument. From NVIDIA's API documentation:

By default, the devices of compute capability lower than 6.x allocate managed memory directly on the GPU. However, the devices of compute capability 6.x and greater do not allocate physical memory when calling cudaMallocManaged(): in this case physical memory is populated on first touch and may be resident on the CPU or the GPU.

My GPU is a Tesla P100, so why do I get an error when I specify the third argument? Does this mean that, as long as the compute capability is 6.x or higher, there is no need to specify the third argument?
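
For reference, here is a minimal sketch (not part of my program above; names are illustrative) that queries the compute capability and the concurrentManagedAccess attribute, which determine which of the two behaviors in the quoted passage applies:

#include <stdio.h>
#include <cuda_runtime.h>

int main(){
  int dev = 0;
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, dev);

  // 1 if the device can coherently access managed memory concurrently with
  // the CPU (the compute capability 6.x+ behavior described in the docs)
  int concurrent = 0;
  cudaDeviceGetAttribute(&concurrent, cudaDevAttrConcurrentManagedAccess, dev);

  printf("%s: compute capability %d.%d, concurrentManagedAccess = %d\n",
         prop.name, prop.major, prop.minor, concurrent);
  return 0;
}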

0 Answers:

There are no answers yet.