EDIT:
I plan to use cudaMallocManaged and cudaStreamAttachMemAsync to achieve compute/copy overlap. One of the steps is to give values to the memory allocated with cudaMallocManaged. I took this answer as an example; in that answer the memory is initialized with memset, but in my case I need to copy already-existing data into the memory allocated by cudaMallocManaged. So I made some modifications to the example code, debugged it with GDB, and ran into the following error:
(gdb) n
Fatal error: cudaStreamAttach fail (invalid argument at cmm.cu:58)
*** FAILED - ABORTING
[Thread 0x7fffeec0d700 (LWP 14751) exited]
[Thread 0x7fffef40e700 (LWP 14750) exited]
[Inferior 1 (process 14742) exited with code 01]
Here is the MCVE:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define DSIZE 1048576
#define DWAIT 100000ULL
#define nTPB 256

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef int mytype;

__global__ void mykernel(mytype *data){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < DSIZE) data[idx] = 1;
    unsigned long long int tstart = clock64();
    while (clock64() < tstart + DWAIT);   // spin so the kernels run long enough to overlap
}

int main(){
    mytype *data1, *data2, *data3;
    mytype *exist_data1, *exist_data2, *exist_data3;
    // pre-existing host data that must end up in the managed allocations
    exist_data1 = (mytype*)malloc(DSIZE*sizeof(mytype));
    exist_data2 = (mytype*)malloc(DSIZE*sizeof(mytype));
    exist_data3 = (mytype*)malloc(DSIZE*sizeof(mytype));
    memset(exist_data1, 0, DSIZE*sizeof(mytype));
    memset(exist_data2, 0, DSIZE*sizeof(mytype));
    memset(exist_data3, 0, DSIZE*sizeof(mytype));
    cudaStream_t stream1, stream2, stream3;
    cudaMallocManaged(&data1, DSIZE*sizeof(mytype)/*, cudaMemAttachHost*/);
    cudaMallocManaged(&data2, DSIZE*sizeof(mytype)/*, cudaMemAttachHost*/);
    cudaMallocManaged(&data3, DSIZE*sizeof(mytype)/*, cudaMemAttachHost*/);
    cudaCheckErrors("cudaMallocManaged fail");
    // is it ok to do this kind of memcpy?
    memcpy(data1, exist_data1, DSIZE*sizeof(mytype));
    memcpy(data2, exist_data2, DSIZE*sizeof(mytype));
    memcpy(data3, exist_data3, DSIZE*sizeof(mytype));
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaStreamCreate(&stream3);
    cudaCheckErrors("cudaStreamCreate fail");
    cudaStreamAttachMemAsync(stream1, data1/*, cudaMemAttachGlobal*/);
    cudaStreamAttachMemAsync(stream2, data2/*, cudaMemAttachGlobal*/);
    cudaStreamAttachMemAsync(stream3, data3/*, cudaMemAttachGlobal*/);
    cudaDeviceSynchronize();
    cudaCheckErrors("cudaStreamAttach fail"); // this error check produces the error message above
    mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream1>>>(data1);
    mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream2>>>(data2);
    mykernel<<<(DSIZE+nTPB-1)/nTPB, nTPB, 0, stream3>>>(data3);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    for (int i = 0; i < DSIZE; i++){
        if (data1[i] != 1) {printf("data1 mismatch at %d, should be: %d, was: %d\n", i, 1, data1[i]); return 1;}
        if (data2[i] != 1) {printf("data2 mismatch at %d, should be: %d, was: %d\n", i, 1, data2[i]); return 1;}
        if (data3[i] != 1) {printf("data3 mismatch at %d, should be: %d, was: %d\n", i, 1, data3[i]); return 1;}
    }
    printf("Success!\n");
    return 0;
}
When I leave those third arguments out (as in the commented code above), the program runs correctly. From NVIDIA's API documentation:

"By default, the devices of compute capability lower than 6.x allocate managed memory directly on the GPU. However, the devices of compute capability 6.x and greater do not allocate physical memory when calling cudaMallocManaged(): in this case, physical memory is populated on first touch and may be resident on the CPU or the GPU."

My GPU is a Tesla P100, so why do I get an error when I specify the third argument? Does this mean that as long as the compute capability is 6.x or higher, there is no need to specify the third argument?
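For reference, here is a minimal standalone sketch of the variant my question refers to: one managed buffer and one stream, with the commented-out third arguments from the MCVE restored. The reduction to a single buffer and the omission of the kernel launch are my own simplifications; on my setup I would expect this to report the same "invalid argument" error as the full program.

#include <stdio.h>

int main(){
    int *data;
    cudaStream_t stream;
    // cudaMallocManaged with its third (flags) argument given explicitly,
    // matching the commented-out cudaMemAttachHost in the MCVE
    cudaMallocManaged(&data, 1048576*sizeof(int), cudaMemAttachHost);
    cudaStreamCreate(&stream);
    // cudaStreamAttachMemAsync with cudaMemAttachGlobal passed as its third argument,
    // matching the commented-out code in the MCVE
    cudaStreamAttachMemAsync(stream, data, cudaMemAttachGlobal);
    cudaDeviceSynchronize();
    // expected to print "invalid argument" here, mirroring the GDB output above
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    cudaStreamDestroy(stream);
    cudaFree(data);
    return 0;
}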