我听说(也读到过):如果需要执行 n 个 FFT,且每个 FFT 作用于一个含 m 个元素的向量,就可以使用 cuFFT 的批处理模式。为了验证这一点,我写了一个示例程序并运行了它。我使用的数据是一个文件,其中同样的 1024 个浮点数重复了 10 次。按理说,每个批次的 1024 点 FFT 应该得到相同的结果,但我得到的结果并不相同。如果我在概念上有误,请指正;如果您能改正我所犯的错误,代码附在下面。
注意:我只使用1D FFT。
以下是代码段:
#include <cuda.h>
#include <cufft.h>
#include <stdio.h>
#include <math.h>
#define NX 1024
#define DATASIZE 1024
#define BATCH 10
// Reads BATCH*DATASIZE floats from "InFile.txt", runs a batched 1D
// real-to-complex FFT with cuFFT, and prints the magnitude of every output
// bin for each batch.
// NOTE(review): none of the CUDA/cuFFT return values (nor fscanf's) are
// checked in this function, so a failing call goes unnoticed.
int main (int argc, char* argv[])
{
cufftHandle plan;
cufftComplex *deviceOutputData, *hostOutputData;
cufftReal *hostInputData, *deviceInputData;
int i,j;
FILE *in; // *out, *fp;
// Input buffers hold NX reals per batch; output buffers are sized for
// NX complex values per batch on both device and host.
cudaMalloc ((void**)&deviceInputData, NX*BATCH*sizeof(cufftReal));
hostInputData = (cufftReal*) malloc (NX*BATCH*sizeof(cufftReal));
cudaMalloc ((void**)&deviceOutputData, NX*BATCH*sizeof(cufftComplex));
hostOutputData = (cufftComplex*) malloc (NX*BATCH*sizeof(cufftComplex));
in = fopen ("InFile.txt", "r");
if (in==NULL)
{ fprintf (stderr, "Input file has some issues. Please check."); exit(1);}
float data;
// Fill the host input: batch i occupies elements [i*DATASIZE, (i+1)*DATASIZE)
for (i=0; i<BATCH; i++){
for (j=0; j<DATASIZE;j++)
{
fscanf(in, "%f", &data);
hostInputData [j + i*DATASIZE] = data;
}
}
fclose (in);
// Upload all batches in one copy
cudaMemcpy (deviceInputData, hostInputData, DATASIZE*BATCH*sizeof(cufftReal), cudaMemcpyHostToDevice);
// Plan and execute BATCH real-to-complex transforms of length NX
cufftPlan1d (&plan, NX, CUFFT_R2C, BATCH);
cufftExecR2C (plan, deviceInputData, deviceOutputData);
// NOTE(review): cudaThreadSynchronize is the legacy name; newer toolkits
// offer cudaDeviceSynchronize -- confirm against the toolkit in use.
cudaThreadSynchronize();
// Download DATASIZE complex values per batch
cudaMemcpy (hostOutputData, deviceOutputData, DATASIZE*BATCH*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
cufftDestroy (plan);
cudaFree (deviceOutputData);
cudaFree (deviceInputData);
// Shorthands for the real/imaginary part of bin j of batch i.
// NOTE(review): these index with a stride of NX complex values per batch,
// whereas (as the accompanying answer points out) an R2C transform of NX
// reals produces NX/2 + 1 complex values per batch. The stride used here
// therefore looks inconsistent with the output layout and is the likely
// reason batches after the first print different values.
#define a hostOutputData[j+i*NX].x
#define b hostOutputData[j+i*NX].y
float result[NX];
for (i=0; i<BATCH; i++){
printf ("\n*New Batch*\n");
// Magnitudes of bins 0 .. NX/2 taken from the transform output
for (j=0; j<=NX/2;j++){
result[j] = sqrt ((a*a)+(b*b));
printf ("%f\n", result[j]);
}
// Bins NX/2+1 .. NX-1 are filled by mirroring the already computed
// magnitudes around index NX/2
for (j=1; j<NX/2; j++){
result[j+(NX/2)] = result [(NX/2)-j];
printf ("%f\n", result[j+(NX/2)]);
}
}
// NOTE(review): the closing brace of main (and the release of the host
// buffers) is not present in this excerpt.
答案 0 :(得分:2)
正如 Robert Crovella 所指出的,并且如《cuFFT 用户指南 - CUDA 6.5》中所述:对 cufftPlan1d() 使用不等于 1 的批量大小已被弃用,应改用 cufftPlanMany() 来执行多批次变换。
下面我给出一个完整的示例,它更正了您的代码,并使用 cufftPlanMany() 代替 cufftPlan1d()。如您所见,
// Parameters describing BATCH contiguous 1D real-to-complex transforms.
int rank = 1; // --- Dimensionality of each transform: 1D FFTs
int n[] = { DATASIZE }; // --- Length of each Fourier transform (one entry per dimension)
int istride = 1, ostride = 1; // --- Distance between two successive input/output elements (1 = contiguous)
int idist = DATASIZE, odist = (DATASIZE / 2 + 1); // --- Distance between the starts of consecutive batches: DATASIZE reals in, DATASIZE/2 + 1 complex values out
int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
int batch = BATCH; // --- Number of batched executions
// Create the batched R2C plan from the layout described above
cufftPlanMany(&handle, rank, n,
inembed, istride, idist,
onembed, ostride, odist, CUFFT_R2C, batch);
完全等同于“老式”
cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
请注意,您的示例没有考虑到这一点:长度为 DATASIZE 的 cufftReal 数组的 1D FFT,其结果是一个含 DATASIZE/2 + 1 个元素的 cufftComplex 数组。
以下是完整示例:
#include <cuda.h>
#include <cufft.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define DATASIZE 8
#define BATCH 2
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports the call site if it fails.
// The do { } while (0) form makes the macro behave as a single statement, so
// `if (cond) gpuErrchk(call); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints the CUDA error string together with the source location when `code`
// is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (normally __FILE__)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (the default) the process exits with `code` as its
//           exit status; when false the error is only reported
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}
/********/
/* MAIN */
/********/
// cuFFT calls return cufftResult rather than cudaError_t, so they need their
// own check. The numeric status is printed along with the call site and the
// process exits on any value other than CUFFT_SUCCESS.
#define cufftErrchk(ans) do { cufftAssert((ans), __FILE__, __LINE__); } while (0)
static void cufftAssert(cufftResult code, const char *file, int line)
{
    if (code != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cuFFTassert: status %d %s %d\n", (int)code, file, line);
        exit(EXIT_FAILURE);
    }
}

// Runs BATCH real-to-complex 1D FFTs of length DATASIZE on the GPU and prints
// every output bin as "batch bin real imag".
// Batch i of the input is filled with the constant value i + 1.
// Returns 0 on success; exits with a non-zero status on any failure.
int main ()
{
    // Each batch consumes DATASIZE reals and produces DATASIZE/2 + 1 complex values
    const int    outPerBatch = DATASIZE / 2 + 1;
    const size_t inBytes     = (size_t)DATASIZE * BATCH * sizeof(cufftReal);
    const size_t outBytes    = (size_t)outPerBatch * BATCH * sizeof(cufftComplex);

    // --- Host side input/output data allocation
    cufftReal    *hostInputData  = (cufftReal*)malloc(inBytes);
    cufftComplex *hostOutputData = (cufftComplex*)malloc(outBytes);
    if (hostInputData == NULL || hostOutputData == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(hostInputData);
        free(hostOutputData);
        return EXIT_FAILURE;
    }

    // --- Host side input data initialization
    for (int i=0; i<BATCH; i++)
        for (int j=0; j<DATASIZE; j++) hostInputData[i*DATASIZE + j] = (cufftReal)(i + 1);

    // --- Device side input data allocation and initialization
    cufftReal *deviceInputData;
    gpuErrchk(cudaMalloc((void**)&deviceInputData, inBytes));
    gpuErrchk(cudaMemcpy(deviceInputData, hostInputData, inBytes, cudaMemcpyHostToDevice));

    // --- Device side output data allocation
    cufftComplex *deviceOutputData;
    gpuErrchk(cudaMalloc((void**)&deviceOutputData, outBytes));

    // --- Batched 1D FFTs
    cufftHandle handle;
    int rank = 1;                                   // --- 1D FFTs
    int n[] = { DATASIZE };                         // --- Size of the Fourier transform
    int istride = 1, ostride = 1;                   // --- Distance between two successive input/output elements
    int idist = DATASIZE, odist = outPerBatch;      // --- Distance between batches
    int inembed[] = { 0 };                          // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                          // --- Output size with pitch (ignored for 1D transforms)
    int batch = BATCH;                              // --- Number of batched executions
    cufftErrchk(cufftPlanMany(&handle, rank, n,
                              inembed, istride, idist,
                              onembed, ostride, odist, CUFFT_R2C, batch));
    // Alternative: cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
    cufftErrchk(cufftExecR2C(handle, deviceInputData, deviceOutputData));

    // --- Device->Host copy of the results
    gpuErrchk(cudaMemcpy(hostOutputData, deviceOutputData, outBytes, cudaMemcpyDeviceToHost));

    for (int i=0; i<BATCH; i++)
        for (int j=0; j<outPerBatch; j++)
            printf("%i %i %f %f\n", i, j, hostOutputData[i*outPerBatch + j].x, hostOutputData[i*outPerBatch + j].y);

    // --- Release the plan, the device buffers and the host buffers
    cufftErrchk(cufftDestroy(handle));
    gpuErrchk(cudaFree(deviceOutputData));
    gpuErrchk(cudaFree(deviceInputData));
    free(hostOutputData);
    free(hostInputData);
    return 0;
}
请根据CUFFT error handling添加您自己的cuFFT错误检查。