Question

我听说过/如果我们有一些 n FFT来执行某些 m ，我们可以使用cuFFT的批处理模式em> 每个向量。所以为了测试它，我制作了一个示例程序并运行它。我使用的数据是一个文件，其中1024个浮点数与重复1024次的10个数相同。虽然我应该得到1024点FFT的相同结果，但我没有得到。如果我在某个地方概念上错了，请纠正我，如果你能纠正我所犯的错误，请在下面填写代码。

注意：我只使用1D FFT。

以下是代码段：

#include <cuda.h> #include <cufft.h> #include <stdio.h> #include <math.h> #define NX 1024 #define DATASIZE 1024 #define BATCH 10 int main (int argc, char* argv[]) { cufftHandle plan; cufftComplex *deviceOutputData, *hostOutputData; cufftReal *hostInputData, *deviceInputData; int i,j; FILE *in; // *out, *fp; cudaMalloc ((void**)&deviceInputData, NX*BATCH*sizeof(cufftReal)); hostInputData = (cufftReal*) malloc (NX*BATCH*sizeof(cufftReal)); cudaMalloc ((void**)&deviceOutputData, NX*BATCH*sizeof(cufftComplex)); hostOutputData = (cufftComplex*) malloc (NX*BATCH*sizeof(cufftComplex)); in = fopen ("InFile.txt", "r"); if (in==NULL) { fprintf (stderr, "Input file has some issues. Please check."); exit(1);} float data; //Allocate data for (i=0; i<BATCH; i++){ for (j=0; j<DATASIZE;j++) { fscanf(in, "%f", &data); hostInputData [j + i*DATASIZE] = data; } } fclose (in); cudaMemcpy (deviceInputData, hostInputData, DATASIZE*BATCH*sizeof(cufftReal), cudaMemcpyHostToDevice); cufftPlan1d (&plan, NX, CUFFT_R2C, BATCH); cufftExecR2C (plan, deviceInputData, deviceOutputData); cudaThreadSynchronize(); cudaMemcpy (hostOutputData, deviceOutputData, DATASIZE*BATCH*sizeof(cufftComplex), cudaMemcpyDeviceToHost); cufftDestroy (plan); cudaFree (deviceOutputData); cudaFree (deviceInputData); #define a hostOutputData[j+i*NX].x #define b hostOutputData[j+i*NX].y float result[NX]; for (i=0; i<BATCH; i++){ printf ("\n*New Batch*\n"); for (j=0; j<=NX/2;j++){ result[j] = sqrt ((a*a)+(b*b)); printf ("%f\n", result[j]); } for (j=1; j<NX/2; j++){ result[j+(NX/2)] = result [(NX/2)-j]; printf ("%f\n", result[j+(NX/2)]); } }

Answer 1

如Robert Crovella所述，并且如cuFFT用户指南 - CUDA 6.5中所述，

cufftPlan1d()的批量大小不是1，已弃用。使用cufftPlanMany()进行多次批量执行。

下面，我将报告一个完整的示例，更正您的代码并使用cufftPlanMany()代替cufftPlan1d()。如您所见，

int rank = 1;                           // --- 1D FFTs
int n[] = { DATASIZE };                 // --- Size of the Fourier transform
int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
int idist = DATASIZE, odist = (DATASIZE / 2 + 1); // --- Distance between batches
int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
int batch = BATCH;                      // --- Number of batched executions
cufftPlanMany(&handle, rank, n, 
              inembed, istride, idist,
              onembed, ostride, odist, CUFFT_R2C, batch);

完全等同于“老式”

cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);

请注意，您的示例并未考虑cufftReal长度为DATASIZE的数组的1D FFT是cufftComplex DATASIZE/2 + 1元素数组。< / p>

以下是完整示例：

#include <cuda.h>
#include <cufft.h>
#include <stdio.h>
#include <math.h>

#define DATASIZE 8
#define BATCH 2

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/********/
/* MAIN */
/********/
int main ()
{
    // --- Host side input data allocation and initialization
    cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
    for (int i=0; i<BATCH; i++)
        for (int j=0; j<DATASIZE; j++) hostInputData[i*DATASIZE + j] = (cufftReal)(i + 1);

    // --- Device side input data allocation and initialization
    cufftReal *deviceInputData; gpuErrchk(cudaMalloc((void**)&deviceInputData, DATASIZE * BATCH * sizeof(cufftReal)));
    cudaMemcpy(deviceInputData, hostInputData, DATASIZE * BATCH * sizeof(cufftReal), cudaMemcpyHostToDevice);

    // --- Host side output data allocation
    cufftComplex *hostOutputData = (cufftComplex*)malloc((DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex));

    // --- Device side output data allocation
    cufftComplex *deviceOutputData; gpuErrchk(cudaMalloc((void**)&deviceOutputData, (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));

    // --- Batched 1D FFTs
    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = { DATASIZE };                 // --- Size of the Fourier transform
    int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
    int idist = DATASIZE, odist = (DATASIZE / 2 + 1); // --- Distance between batches
    int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int batch = BATCH;                      // --- Number of batched executions
    cufftPlanMany(&handle, rank, n, 
                  inembed, istride, idist,
                  onembed, ostride, odist, CUFFT_R2C, batch);

    //cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
    cufftExecR2C(handle,  deviceInputData, deviceOutputData);

    // --- Device->Host copy of the results
    gpuErrchk(cudaMemcpy(hostOutputData, deviceOutputData, (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex), cudaMemcpyDeviceToHost));

    for (int i=0; i<BATCH; i++)
        for (int j=0; j<(DATASIZE / 2 + 1); j++)
            printf("%i %i %f %f\n", i, j, hostOutputData[i*(DATASIZE / 2 + 1) + j].x, hostOutputData[i*(DATASIZE / 2 + 1) + j].y);

    cufftDestroy(handle);
    gpuErrchk(cudaFree(deviceOutputData));
    gpuErrchk(cudaFree(deviceInputData));

}

请根据CUFFT error handling添加您自己的cuFFT错误检查。

实数阵列的1D批量FFT

1 个答案: