I have a set of image tiles as input to a chain of CUDA kernels. In that execution chain, the output of one step is used as the input of the next, without copying any intermediate output back to host memory:
cudaKernel1(inputImage, out1, stream);
cudaKernel2(out1, out2, stream);
cudaKernel3(out2, out3, stream);
....
cudaKernelN(..., ..., stream);
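For concreteness, each cudaKernelX above stands for an ordinary kernel launch issued on the same stream. Below is a minimal sketch of that chaining, assuming two hypothetical kernels stepA/stepB and device buffers d_in/d_tmp/d_out (none of these names are from the question):
#include <cuda_runtime.h>

// Placeholder kernels; the bodies are dummy computations.
__global__ void stepA(const float* in, float* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i] * 2.0f;
}
__global__ void stepB(const float* in, float* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i] + 1.0f;
}

void run_chain(const float* d_in, float* d_tmp, float* d_out, int n,
               cudaStream_t stream)
{
    dim3 block(256), grid((n + block.x - 1) / block.x);
    // Work queued on the same stream runs in issue order, so stepB can
    // consume d_tmp without any host round trip in between.
    stepA<<<grid, block, 0, stream>>>(d_in, d_tmp, n);
    stepB<<<grid, block, 0, stream>>>(d_tmp, d_out, n);
}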
But for certain cases I have to include an if/else condition in the execution chain, and to evaluate it I must copy an intermediate result back to host memory:
cudaKernel1(inputImage, out1, stream);
cudaKernel2(out1, out2, stream);
cudaKernel3(out2, out3, stream);
....
cudaKernel11(out10, out11, stream);
copyDtoHAsync(temp, out11, stream);
cuStreamSynchronize(stream);
if(SOME_CONDITION_ON_temp)
{
cudaKernel12(out11, out12, stream);
cudaKernel13(out12, out13, stream);
cudaKernel14(out13, out14, stream);
.........
}
In the scheme above, the copyDtoHAsync, cuStreamSynchronize, and if calls are blocking: the host thread has to wait for the stream to finish before it can decide how to continue.
Suppose I have 100 input tiles being processed concurrently on multiple GPU streams, and the condition turns out to be true for 40 tiles and false for the remaining 60. What is the best way to manage these intermediate blocking calls? How can I make sure those 40 tiles keep executing on the GPU without being stalled by the blocking calls?
Any posts, similar questions, or relevant examples would be appreciated.
Answer 0 (score: 2)
You could try OpenMP. Something like this:
cudaStream_t streams[num_tiles];
#pragma omp parallel for
for (int i = 0; i < num_tiles; i++)
{
    cudaStreamCreate(&streams[i]);
    // Note: each tile needs its own out1..out14/temp buffers here,
    // otherwise the OpenMP threads race on the shared pointers.
    cudaKernel1(inputImage, out1, streams[i]);
    cudaKernel2(out1, out2, streams[i]);
    cudaKernel3(out2, out3, streams[i]);
    ...
    cudaKernel11(out10, out11, streams[i]);
    copyDtoHAsync(temp, out11, streams[i]);
    // Blocks only this OpenMP thread; the other tiles' streams keep running.
    cuStreamSynchronize(streams[i]);
    if (SOME_CONDITION_ON_temp)
    {
        cudaKernel12(out11, out12, streams[i]);
        cudaKernel13(out12, out13, streams[i]);
        cudaKernel14(out13, out14, streams[i]);
        .........
    }
    cudaStreamDestroy(streams[i]);
}
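With the OpenMP version, cuStreamSynchronize only blocks the host thread that issued it, so the 40 tiles whose condition is true are never held up by the other 60 streams.

If you would rather not block any host thread at all, a different pattern (not from the answer above, just a sketch) is to record a CUDA event after the device-to-host copy and poll it with cudaEventQuery from a single dispatcher loop, enqueueing the tail kernels on a tile's stream as soon as its copy has finished. Everything below except the stream/event API calls is a placeholder: enqueue_head/enqueue_tail/check_condition stand in for kernels 1..11 + copyDtoHAsync, kernels 12..14, and the test on the copied data (which would live in pinned host memory so the copy can truly be asynchronous).
#include <cuda_runtime.h>
#include <vector>

// Hypothetical stand-ins for the kernel chains in the question.
static void enqueue_head(int /*tile*/, cudaStream_t /*s*/) { /* kernels 1..11 + copyDtoHAsync */ }
static void enqueue_tail(int /*tile*/, cudaStream_t /*s*/) { /* kernels 12..14 */ }
static bool check_condition(int /*tile*/) { /* inspect the copied host data */ return true; }

void dispatch_tiles(int num_tiles)
{
    std::vector<cudaStream_t> streams(num_tiles);
    std::vector<cudaEvent_t>  copyDone(num_tiles);
    std::vector<bool>         handled(num_tiles, false);

    for (int i = 0; i < num_tiles; ++i) {
        cudaStreamCreate(&streams[i]);
        cudaEventCreateWithFlags(&copyDone[i], cudaEventDisableTiming);
        enqueue_head(i, streams[i]);              // async work, returns immediately
        cudaEventRecord(copyDone[i], streams[i]); // fires once the D->H copy has landed
    }

    // Single dispatcher loop: never sleeps inside a synchronize call.
    int remaining = num_tiles;
    while (remaining > 0) {
        for (int i = 0; i < num_tiles; ++i) {
            if (handled[i]) continue;
            if (cudaEventQuery(copyDone[i]) == cudaSuccess) { // non-blocking poll
                handled[i] = true;
                --remaining;
                if (check_condition(i))
                    enqueue_tail(i, streams[i]);  // "true" tiles continue at once
            }
        }
    }

    for (int i = 0; i < num_tiles; ++i) {
        cudaEventDestroy(copyDone[i]);
        cudaStreamDestroy(streams[i]);
    }
}
The polling loop busy-waits here for brevity; in practice you might interleave it with other work or add a short sleep. The key point is that a tile whose condition is true gets kernels 12..14 enqueued as soon as its own copy completes, independently of all the other tiles.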