cufftSetStream导致垃圾输出。难道我做错了什么?

时间:2016-08-19 04:07:57

标签: cuda cufft

根据文档,cufftSetStream()函数

  

将CUDA流与cuFFT计划相关联。在计划执行期间进行的所有内核启动现在都通过关联的流[...直到...]完成,并且通过另一次调用cufftSetStream()来更改流。

不幸的是,结果变成了垃圾。这是一个通过两种方式执行一系列转换来演示这一点的示例:一次是每个流都有自己的专用计划,一次是单个计划被重用,如上面的文档所示。前者的行为与预期一致,reused / cufftSetStream方法在大多数变换中都有错误。这是在我在CentOS 7 linux上试过的两张卡(GTX 750 ti,Titan X)上观察到的 Cuda编译工具,7.0版,V7.0.27;并发布7.5,V7.5.17。

编辑:参见" FIX"下面的评论是解决问题的一种方法。

#include <cufft.h>
#include <stdexcept>
#include <iostream>
#include <numeric>
#include <vector>

#define ck(cmd) if ( cmd) { std::cerr << "error at line " << __LINE__ << std::endl;exit(1);}


__global__
void fill_input(cufftComplex * buf, int batch,int nbins,int stride,int seed)
{
    for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y)
        for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nbins;j += gridDim.x*blockDim.x)
            buf[i*stride + j] = make_cuFloatComplex( (i+seed)%101 - 50,(j+seed)%41-20);
}

__global__
void check_output(const float * buf1,const float * buf2,int batch, int nfft, int stride, int * errors)
{
    for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y) {
        for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nfft;j += gridDim.x*blockDim.x) {
            float e=buf1[i*stride+j] - buf2[i*stride+j];
            if (e*e > 1) // gross error
                atomicAdd(errors,1);
        }
    }
}

void demo(bool reuse_plan)
{
    if (reuse_plan)
        std::cout << "Reusing the same fft plan with multiple stream via cufftSetStream ... ";
    else
        std::cout << "Giving each stream its own dedicated fft plan ... ";
    int nfft = 1024;
    int batch = 1024;
    int nstreams = 8;
    int nbins = nfft/2+1;
    int nit=100;
    size_t inpitch,outpitch;

    std::vector<cufftComplex*> inbufs(nstreams);
    std::vector<float*> outbufs(nstreams);
    std::vector<float*> checkbufs(nstreams);
    std::vector<cudaStream_t> streams(nstreams);
    std::vector<cufftHandle> plans(nstreams);
    for (int i=0;i<nstreams;++i) {
        ck( cudaStreamCreate(&streams[i]));
        ck( cudaMallocPitch((void**)&inbufs[i],&inpitch,nbins*sizeof(cufftComplex),batch) );
        ck( cudaMallocPitch((void**)&outbufs[i],&outpitch,nfft*sizeof(float),batch));
        ck( cudaMallocPitch((void**)&checkbufs[i],&outpitch,nfft*sizeof(float),batch) );
        if (i==0 || reuse_plan==false)
            ck ( cufftPlanMany(&plans[i],1,&nfft,&nbins,1,inpitch/sizeof(cufftComplex),&nfft,1,outpitch/sizeof(float),CUFFT_C2R,batch) );
    }

    // fill the input buffers and FFT them to get a baseline for comparison
    for (int i=0;i<nstreams;++i) {
        fill_input<<<20,dim3(32,32)>>>(inbufs[i],batch,nbins,inpitch/sizeof(cufftComplex),i);
        ck (cudaGetLastError());
        if (reuse_plan) {
            ck (cufftExecC2R(plans[0],inbufs[i],checkbufs[i]));
        }else{
            ck (cufftExecC2R(plans[i],inbufs[i],checkbufs[i]));
            ck( cufftSetStream(plans[i],streams[i]) ); // only need to set the stream once
        }
        ck( cudaDeviceSynchronize());
    }
    // allocate a buffer for the error count
    int * errors;
    cudaMallocHost((void**)&errors,sizeof(int)*nit);
    memset(errors,0,sizeof(int)*nit);

    /* FIX: an event can protect the plan internal buffers 
    by serializing access to the plan
    cudaEvent_t ev;
    cudaEventCreateWithFlags(&ev,cudaEventDisableTiming);
    */

    // perform the FFTs and check the outputs on streams
    for (int it=0;it<nit;++it) {
        int k = it % nstreams;
        ck( cudaStreamSynchronize(streams[k]) ); // make sure any prior kernels have completed
        if (reuse_plan) {
            // FIX: ck(cudaStreamWaitEvent(streams[k],ev,0 ) );
            ck(cufftSetStream(plans[0],streams[k]));
            ck(cufftExecC2R(plans[0],inbufs[k],outbufs[k]));
            // FIX: ck(cudaEventRecord(ev,streams[k] ) );
        }else{
            ck(cufftExecC2R(plans[k],inbufs[k],outbufs[k]));
        }
        check_output<<<100,dim3(32,32),0,streams[k]>>>(outbufs[k],checkbufs[k],batch,nfft,outpitch/sizeof(float),&errors[it]);
        ck (cudaGetLastError());
    }
    ck(cudaDeviceSynchronize());

    // report number of errors
    int errcount=0;
    for (int it=0;it<nit;++it)
        if (errors[it])
            ++errcount;
    std::cout << errcount << " of " << nit << " transforms had errors\n";

    for (int i=0;i<nstreams;++i) {
        cudaFree(inbufs[i]);
        cudaFree(outbufs[i]);
        cudaStreamDestroy(streams[i]);
        if (i==0 || reuse_plan==false)
            cufftDestroy(plans[i]);
    }
}

int main(int argc,char ** argv)
{
    demo(false);
    demo(true);
    return 0;
}

典型输出

  

为每个流提供自己专用的fft计划... 100个变换中的0个有错误   通过cufftSetStream重复使用多个流的相同fft计划... 100个转换中的87个有错误

1 个答案:

答案 0 :(得分:3)

为了按照您希望的方式重复使用计划,您需要手动管理cuFFT工作区。

每个计划都有一个空间用于中间计算结果。如果您想同时使用计划句柄 来执行两个或更多不同的计划执行,则需要为每个并发的cufftExec *调用提供临时缓冲区。

您可以使用cufftSetWorkArea执行此操作 - 请参阅cuFFT文档中的第3.7节。第2.2节也有助于理解它是如何工作的。

这是一个有效的示例,显示了对此代码的更改:

.demo-container {
  padding: 10px;
  background-color: orange;
  border: 1px solid #000000;
  box-sizing: border-box;
    box-shadow: 0 3px 10px rgba(0,0,0,0.15);
    -o-box-shadow: 0 3px 10px rgba(0,0,0,0.1);
    -ms-box-shadow: 0 3px 10px rgba(0,0,0,0.1);
    -moz-box-shadow: 0 3px 10px rgba(0,0,0,0.1);
    -webkit-box-shadow: 0 3px 10px rgba(0,0,0,0.1);
}