我是CUDA的新手,我正在尝试使用CuFFT库在GPU上执行FFT。问题是,当我第一次运行编译的代码时需要大约500us,但是如果我立即再次运行它而不需要等待它需要大约175us。 (我从cmd运行.exe)。每当我暂停一下,计算FFT需要500us。对于不同的NX点DFT,它给出了几乎相同的时序。我正在计算从FFT开始到结束的时间,而不是数据复制时间。谁能告诉我这是什么问题。我期待的是高NX点FFT的执行时间更长,但它几乎占用了相同的时间。以下是我的代码。
我正在使用Visual C ++ 2010 Express和CUDA v6.0。系统规格:Corei7 3.60Ghz,RAM:16GB,GPU:GeForce GT640(使用相同的GPU进行显示和计算)
任何建议和帮助都将不胜感激。
// Raw Data Generation
#define TABLE_SIZE 1000
#define TWO_PI (3.14159 * 2)
#define CYCLES 20
#define NUMBER_OF_SAMPLES (TABLE_SIZE*CYCLES)
// FFT Values
#define NX 2048 // NX-point DFT
#define BATCH 1
// Sine Generator Function
void sin_func(float *sample_ptr)
{
float phaseIncrement = TWO_PI/TABLE_SIZE;
float currentPhase = 0.0;
int i;
for (i = 0; i < CYCLES*TABLE_SIZE; i ++){
*sample_ptr = sin(currentPhase);
sample_ptr = sample_ptr + sizeof(float)/4;
currentPhase += phaseIncrement;
}
}
void main()
{
const int ARRAY_SIZE = NUMBER_OF_SAMPLES*sizeof(float);
const int FFT_OUT_SIZE = sizeof(cufftComplex)*(NX/2+1)*BATCH;
// Variable Declaration for execution time computation
LARGE_INTEGER ticksPerSecond;
LARGE_INTEGER startTick; // A point in time
LARGE_INTEGER starttime; // For converting tick into real time
LARGE_INTEGER endTick; // A point in time
LARGE_INTEGER endtime; // For converting tick into real time
// get the high resolution counter's accuracy
QueryPerformanceFrequency(&ticksPerSecond);
//
// Initialization of input data on Host
float h_rawdata[NUMBER_OF_SAMPLES];
float h_checkdata[NUMBER_OF_SAMPLES];
sin_func(&h_rawdata[0]);
// Display values in the resulting array
for (int i =0; i < 12 ; i++) {
printf("%f", h_rawdata[i]);
printf(((i % 4) != 3) ? "\t" : "\n");
}
//Initializing output array on Host
cufftComplex h_fftout[FFT_OUT_SIZE];
//Allocate memory on GPU
float *d_rawdata;
float *d_checkdata;
cufftHandle plan;
cufftComplex *d_fftout;
cudaMalloc((void**)&d_rawdata, ARRAY_SIZE);
cudaMalloc((void**)&d_checkdata, ARRAY_SIZE); // For Testing Only
cudaMalloc((void**)&d_fftout, FFT_OUT_SIZE);
//copying data to device(GPU) memory
cudaMemcpy (d_rawdata, h_rawdata, ARRAY_SIZE, cudaMemcpyHostToDevice);
// ** Doing FFT ** //
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
return;
}
if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
return;
}
// fft starting
QueryPerformanceCounter(&startTick); // Time stamp at start of FFT
//if (cufftExecR2C(plan, (cufftReal*)d_rawdata, d_fftout) != CUFFT_SUCCESS){
if (cufftExecR2C(plan, d_rawdata, d_fftout) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
return;
}
if (cudaDeviceSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}
QueryPerformanceCounter(&endTick); // Time stamp at end End of FFT
// ** Doing Inverse FFT ** //
if (cufftPlan1d(&plan, NX, CUFFT_C2R, BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
return;
}
if (cufftExecC2R(plan, d_fftout, d_checkdata) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
return;
}
if (cudaDeviceSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}
// Copying Data Back to Host
cudaMemcpy (h_fftout, d_fftout, FFT_OUT_SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy (h_checkdata, d_checkdata, ARRAY_SIZE, cudaMemcpyDeviceToHost);
cufftDestroy(plan);
cudaFree(d_rawdata);
cudaFree(d_fftout);
printf("\n");
// Displaying the resulting array
for (int i =0; i < 12 ; i++) {
printf("%f", h_checkdata[i]/NX);
printf(((i % 4) != 3) ? "\t" : "\n");
}
/// Ticks conversion
// convert the tick number into the number of seconds
// since the system was started...
starttime.QuadPart = startTick.QuadPart/ticksPerSecond.QuadPart;
endtime.QuadPart = endTick.QuadPart/ticksPerSecond.QuadPart;
//get the number of hours
int starthours = starttime.QuadPart/3600;
int endhours = endtime.QuadPart/3600;
//get the number of minutes
starttime.QuadPart = starttime.QuadPart - (starthours * 3600);
endtime.QuadPart = endtime.QuadPart - (endhours * 3600);
int startminutes = starttime.QuadPart/60;
int endminutes = endtime.QuadPart/60;
//get the number of seconds
int startseconds = starttime.QuadPart - (startminutes * 60);
int endseconds = starttime.QuadPart - (endminutes *60);
double ticks_per_micro= (double)ticksPerSecond.QuadPart/1000000;
//printf ("\n div = %f",ticks_per_micro);
//get the number of Microseconds
double startmicroSecondes = (double)((startTick.QuadPart % ticksPerSecond.QuadPart) / ticks_per_micro);
double endmicroSecondes = (double)((endTick.QuadPart % ticksPerSecond.QuadPart) / ticks_per_micro);
printf ("\n FFT Started %d:%d:%d::%.2f",starthours, startminutes, startseconds, startmicroSecondes);
printf ("\n FFT Ended %d:%d:%d::%.2f \n",endhours, endminutes, endseconds, endmicroSecondes);
printf ("\nFFT computation time for %d point DFT: %.2fus \n", NX, endmicroSecondes - startmicroSecondes);
}