According to the documentation, the cufftSetStream() function "associates a CUDA stream with a cuFFT plan. All kernel launches made during plan execution are now done through the associated stream [...] until the stream is changed with another call to cufftSetStream()."
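In outline, the documented usage pattern is the following (a minimal sketch; the stream and buffer names are placeholders, and error checking is elided):

cufftHandle plan;                  // one plan, created once
cufftPlan1d(&plan, nfft, CUFFT_C2R, batch);
cufftSetStream(plan, streamA);     // launches now go through streamA
cufftExecC2R(plan, inA, outA);
cufftSetStream(plan, streamB);     // ...until the stream is changed again
cufftExecC2R(plan, inB, outB);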
Unfortunately, the results turn into garbage. Here is an example that demonstrates this by performing a sequence of transforms in two ways: once with each stream given its own dedicated plan, and once with a single plan reused across streams as the documentation above suggests. The former behaves as expected; the reused/cufftSetStream approach produces errors in most of the transforms. This was observed on both cards I tried (GTX 750 Ti, Titan X) on CentOS 7 Linux, with CUDA compilation tools release 7.0, V7.0.27 and release 7.5, V7.5.17.
EDIT: see the comments marked "FIX" below for one way to work around the problem.
#include <cufft.h>
#include <stdexcept>
#include <iostream>
#include <numeric>
#include <vector>
#define ck(cmd) if (cmd) { std::cerr << "error at line " << __LINE__ << std::endl; exit(1); }
__global__
void fill_input(cufftComplex * buf, int batch,int nbins,int stride,int seed)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y)
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nbins;j += gridDim.x*blockDim.x)
buf[i*stride + j] = make_cuFloatComplex( (i+seed)%101 - 50,(j+seed)%41-20);
}
__global__
void check_output(const float * buf1,const float * buf2,int batch, int nfft, int stride, int * errors)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y) {
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nfft;j += gridDim.x*blockDim.x) {
float e=buf1[i*stride+j] - buf2[i*stride+j];
if (e*e > 1) // gross error
atomicAdd(errors,1);
}
}
}
void demo(bool reuse_plan)
{
if (reuse_plan)
std::cout << "Reusing the same fft plan with multiple stream via cufftSetStream ... ";
else
std::cout << "Giving each stream its own dedicated fft plan ... ";
int nfft = 1024;
int batch = 1024;
int nstreams = 8;
int nbins = nfft/2+1;
int nit=100;
size_t inpitch,outpitch;
std::vector<cufftComplex*> inbufs(nstreams);
std::vector<float*> outbufs(nstreams);
std::vector<float*> checkbufs(nstreams);
std::vector<cudaStream_t> streams(nstreams);
std::vector<cufftHandle> plans(nstreams);
for (int i=0;i<nstreams;++i) {
ck( cudaStreamCreate(&streams[i]));
ck( cudaMallocPitch((void**)&inbufs[i],&inpitch,nbins*sizeof(cufftComplex),batch) );
ck( cudaMallocPitch((void**)&outbufs[i],&outpitch,nfft*sizeof(float),batch));
ck( cudaMallocPitch((void**)&checkbufs[i],&outpitch,nfft*sizeof(float),batch) );
if (i==0 || reuse_plan==false)
ck ( cufftPlanMany(&plans[i],1,&nfft,&nbins,1,inpitch/sizeof(cufftComplex),&nfft,1,outpitch/sizeof(float),CUFFT_C2R,batch) );
}
// fill the input buffers and FFT them to get a baseline for comparison
for (int i=0;i<nstreams;++i) {
fill_input<<<20,dim3(32,32)>>>(inbufs[i],batch,nbins,inpitch/sizeof(cufftComplex),i);
ck (cudaGetLastError());
if (reuse_plan) {
ck (cufftExecC2R(plans[0],inbufs[i],checkbufs[i]));
}else{
ck (cufftExecC2R(plans[i],inbufs[i],checkbufs[i]));
ck( cufftSetStream(plans[i],streams[i]) ); // only need to set the stream once
}
ck( cudaDeviceSynchronize());
}
// allocate a buffer for the error count
int * errors;
ck( cudaMallocHost((void**)&errors,sizeof(int)*nit) );
memset(errors,0,sizeof(int)*nit);
/* FIX: an event can protect the plan internal buffers
by serializing access to the plan
cudaEvent_t ev;
cudaEventCreateWithFlags(&ev,cudaEventDisableTiming);
*/
// perform the FFTs and check the outputs on streams
for (int it=0;it<nit;++it) {
int k = it % nstreams;
ck( cudaStreamSynchronize(streams[k]) ); // make sure any prior kernels have completed
if (reuse_plan) {
// FIX: ck(cudaStreamWaitEvent(streams[k],ev,0 ) );
ck(cufftSetStream(plans[0],streams[k]));
ck(cufftExecC2R(plans[0],inbufs[k],outbufs[k]));
// FIX: ck(cudaEventRecord(ev,streams[k] ) );
}else{
ck(cufftExecC2R(plans[k],inbufs[k],outbufs[k]));
}
check_output<<<100,dim3(32,32),0,streams[k]>>>(outbufs[k],checkbufs[k],batch,nfft,outpitch/sizeof(float),&errors[it]);
ck (cudaGetLastError());
}
ck(cudaDeviceSynchronize());
// report number of errors
int errcount=0;
for (int it=0;it<nit;++it)
if (errors[it])
++errcount;
std::cout << errcount << " of " << nit << " transforms had errors\n";
for (int i=0;i<nstreams;++i) {
cudaFree(inbufs[i]);
cudaFree(outbufs[i]);
cudaStreamDestroy(streams[i]);
if (i==0 || reuse_plan==false)
cufftDestroy(plans[i]);
}
cudaFreeHost(errors);
}
int main(int argc,char ** argv)
{
demo(false);
demo(true);
return 0;
}
Typical output:

Giving each stream its own dedicated fft plan ... 0 of 100 transforms had errors
Reusing the same fft plan with multiple streams via cufftSetStream ... 87 of 100 transforms had errors
Answer (score: 3)
To reuse plans the way you want, you need to manage cuFFT work areas manually.
Each plan has a scratch space for intermediate computation results. If you want to use a single plan handle for two or more concurrent plan executions, you need to provide a temporary buffer for each concurrent cufftExec* call.
You can do this with cufftSetWorkArea; see section 3.7 in the cuFFT documentation. Section 2.2 is also helpful for understanding how it works.
Here is how the changes to this code look:
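A minimal sketch of that approach, assuming one explicitly allocated scratch buffer per stream. The names workbufs and worksize are illustrative; cufftCreate, cufftSetAutoAllocation, cufftMakePlanMany, and cufftSetWorkArea are the cuFFT calls referenced above:

// Setup, replacing the cufftPlanMany call in demo(): create ONE plan,
// disable its internal work-area allocation, and give each stream its
// own scratch buffer of the size the plan reports.
cufftHandle plan;
size_t worksize = 0;
ck( cufftCreate(&plan) );
ck( cufftSetAutoAllocation(plan, 0) ); // we supply the work area ourselves
ck( cufftMakePlanMany(plan, 1, &nfft,
                      &nbins, 1, inpitch/sizeof(cufftComplex),
                      &nfft,  1, outpitch/sizeof(float),
                      CUFFT_C2R, batch, &worksize) );
std::vector<void*> workbufs(nstreams);
for (int i = 0; i < nstreams; ++i)
    ck( cudaMalloc(&workbufs[i], worksize) );

// Execution, replacing the reuse_plan branch of the main loop: bind the
// plan to the stream AND to that stream's private work area before each
// call, so each enqueued execution uses its own scratch space instead of
// a buffer shared with transforms still running in other streams.
int k = it % nstreams;
ck( cufftSetStream(plan, streams[k]) );
ck( cufftSetWorkArea(plan, workbufs[k]) );
ck( cufftExecC2R(plan, inbufs[k], outbufs[k]) );

Cleanup then consists of a single cufftDestroy(plan) plus a cudaFree for each entry of workbufs.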