I have some code for texture object allocation and host-to-device copying. It is just a modification of the answer here. I am not explicitly using streams, just cudaSetDevice().
The code works correctly, but when I run the Visual Profiler I can see that the memory copies from host to the arrays are not asynchronous. Each copy is assigned to its own device stream, yet the second one does not start until the first one has finished (running on 2 GPUs). I have tried it with large images, so I am fairly sure it is not just CPU overhead.
My guess is that something in the code requires synchronization and therefore stalls the CPU, but I can't figure out what. How can I make this loop asynchronous?
MCVE:
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage);

int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    int nVoxelX = 512;
    int nVoxelY = 512;
    int nVoxelZ = 512;
    float* image = (float*)malloc(nVoxelX*nVoxelY*nVoxelZ*sizeof(float));

    cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount];
    cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

    CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);
}
The actual function:
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
{
    //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);

        //cudaArray Descriptor
        const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        //cudaCheckErrors("Texture memory allocation fail");

        cudaMemcpy3DParms copyParams = {0};
        //Array creation
        copyParams.srcPtr   = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
        copyParams.dstArray = d_cuArrTex[i];
        copyParams.extent   = extent;
        copyParams.kind     = cudaMemcpyHostToDevice;
        cudaMemcpy3DAsync(&copyParams);
        //cudaCheckErrors("Texture memory data copy fail");
        //Array creation End

        cudaResourceDesc texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array = d_cuArrTex[i];
        cudaTextureDesc texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModePoint;
        texDescr.addressMode[0] = cudaAddressModeBorder;
        texDescr.addressMode[1] = cudaAddressModeBorder;
        texDescr.addressMode[2] = cudaAddressModeBorder;
        texDescr.readMode = cudaReadModeElementType;
        cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        //cudaCheckErrors("Texture object creation fail");
    }
}
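The commented-out cudaCheckErrors() calls just refer to a generic CUDA error-checking macro; any equivalent wrapper would do. A minimal sketch of such a macro (not necessarily the exact one used here) could be:

#include <cstdio>
#include <cstdlib>

// Report the last CUDA runtime error and abort; "msg" tags the failing call.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)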
Answer 0: (score: 1)
The two main issues I see in your code are:
1. Your host allocation is a pageable allocation. One of the requirements for a CUDA copy operation involving host memory to be asynchronous is that the host memory be a pinned allocation (a short sketch of this change follows this list).
2. You have other synchronizing operations in your texture creation loop. In my experience, device allocation operations (cudaMalloc3DArray in this case) are synchronizing. I haven't run tests to determine whether cudaCreateTextureObject is synchronizing, but I wouldn't be surprised if it were. Therefore, my general recommendation for asynchrony is to get synchronizing operations out of the loop.
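To illustrate the first point, the host-side change amounts to making the image buffer a pinned allocation instead of a pageable malloc. A minimal sketch, using the same image buffer and nVoxel sizes as in your code:

// pageable allocation: copies issued with cudaMemcpy3DAsync() from this
// buffer behave as if they were synchronous and will not overlap
// float* image = (float*)malloc(nVoxelX*nVoxelY*nVoxelZ*sizeof(float));

// pinned allocation: allows the async copies to different devices to overlap
float* image;
cudaHostAlloc(&image, nVoxelX*nVoxelY*nVoxelZ*sizeof(float), cudaHostAllocDefault);

// ... fill and use the buffer ...

// pinned memory is released with cudaFreeHost(), not free()
cudaFreeHost(image);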
In your case, we can refactor your code as follows, which appears to allow the operations to overlap from the perspective of nvprof:
$ cat t399.cu
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
{
    //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;
    const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);

    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        //cudaArray Descriptor
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        //cudaCheckErrors("Texture memory allocation fail");
    }

    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        cudaMemcpy3DParms copyParams = {0};
        //Array creation
        copyParams.srcPtr   = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
        copyParams.dstArray = d_cuArrTex[i];
        copyParams.extent   = extent;
        copyParams.kind     = cudaMemcpyHostToDevice;
        cudaMemcpy3DAsync(&copyParams);
        //cudaCheckErrors("Texture memory data copy fail");
    }

    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        //Array creation End
        cudaResourceDesc texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array = d_cuArrTex[i];
        cudaTextureDesc texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModePoint;
        texDescr.addressMode[0] = cudaAddressModeBorder;
        texDescr.addressMode[1] = cudaAddressModeBorder;
        texDescr.addressMode[2] = cudaAddressModeBorder;
        texDescr.readMode = cudaReadModeElementType;
        cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        //cudaCheckErrors("Texture object creation fail");
    }

    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        cudaDeviceSynchronize();
    }
}

int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    int nVoxelX = 512;
    int nVoxelY = 512;
    int nVoxelZ = 512;

    float* image;
    cudaHostAlloc(&image, nVoxelX*nVoxelY*nVoxelZ*sizeof(float), cudaHostAllocDefault);

    cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount];
    cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

    CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);
}
$ nvcc -o t399 t399.cu
$ cuda-memcheck ./t399
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof --print-gpu-trace ./t399
==19953== NVPROF is profiling process 19953, command: ./t399
==19953== Profiling application: ./t399
==19953== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
1.55311s 90.735ms - - - - - 512.00MB 5.5106GB/s Pinned Array Tesla P100-PCIE 1 7 [CUDA memcpy HtoA]
1.55316s 90.640ms - - - - - 512.00MB 5.5163GB/s Pinned Array Tesla K40m (1) 2 18 [CUDA memcpy HtoA]
1.55318s 85.962ms - - - - - 512.00MB 5.8165GB/s Pinned Array Tesla K20Xm (2) 3 29 [CUDA memcpy HtoA]
1.55320s 89.908ms - - - - - 512.00MB 5.5612GB/s Pinned Array Tesla K20Xm (3) 4 40 [CUDA memcpy HtoA]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$
My system happens to be a 4-GPU system, with two GPUs hanging off each of two root ports. So from a bandwidth perspective, the roughly 10GB/s of host->device pinned transfer bandwidth on PCIE Gen3 is being split between the 2 GPUs on each port, but careful study of the start times and durations of the transfers in the profiler shows that all 4 overlap from the profiler's point of view.
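For example, reading the trace above: the first copy starts at 1.55311 s and takes 90.735 ms, so it does not finish until roughly 1.644 s, while the last of the four copies already starts at 1.55320 s, only about 0.09 ms later; all four 512 MB transfers are therefore in flight at the same time.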