我是CUDA和CUFFT的新手,当我尝试通过应用相应的cufftExecC2R(...)
恢复cufftExecC2R(...)
的fft结果时,它出错了,恢复的数据和原始数据不相同
这是代码,我使用的cuda库是cuda-9.0。
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "cuda.h"
#include "cufft.h"
#include <iostream>
#include <sys/time.h>
#include <cstdio>
#include <cmath>
using namespace std;
// cuda error check
#define gpuErrchk(ans) {gpuAssrt((ans), __FILE__, __LINE__);}
inline void gpuAssrt(cudaError_t code, const char* file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) {
exit(code);
}
}
}
// ifft scale for cufft
__global__ void IFFTScale(int scale_, cufftReal* real) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
real[idx] *= 1.0 / scale_;
}
void batch_1d_irfft2_test() {
const int BATCH = 3;
const int DATASIZE = 4;
/// RFFT
// --- Host side input data allocation and initialization
cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
for (int i = 0; i < BATCH; ++ i) {
for (int j = 0; j < DATASIZE; ++ j) {
hostInputData[i * DATASIZE + j] = (cufftReal)(i * DATASIZE + j + 1);
}
}
// DEBUG:print host input data
cout << "print host input data" << endl;
for (int i = 0; i < BATCH; ++ i) {
for (int j = 0; j < DATASIZE; ++ j) {
cout << hostInputData[i * DATASIZE + j] << ", ";
}
cout << endl;
}
cout << "=====================================================" << endl;
// --- Device side input data allocation and initialization
cufftReal *deviceInputData;
gpuErrchk(cudaMalloc((void**)&deviceInputData, DATASIZE * BATCH * sizeof(cufftReal)));
// --- Device side output data allocation
cufftComplex *deviceOutputData;
gpuErrchk(cudaMalloc(
(void**)&deviceOutputData,
(DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));
// Host sice input data copied to Device side
cudaMemcpy(deviceInputData,
hostInputData,
DATASIZE * BATCH * sizeof(cufftReal),
cudaMemcpyHostToDevice);
// --- Batched 1D FFTs
cufftHandle handle;
int rank = 1; // --- 1D FFTs
int n[] = {DATASIZE}; // --- Size of the Fourier transform
int istride = 1, ostride = 1; // --- Distance between two successive input/output elements
int idist = DATASIZE, odist = DATASIZE / 2 + 1; // --- Distance between batches
int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
int batch = BATCH; // --- Number of batched executions
cufftPlanMany(
&handle,
rank,
n,
inembed, istride, idist,
onembed, ostride, odist,
CUFFT_R2C,
batch);
cufftExecR2C(handle, deviceInputData, deviceOutputData);
// **************************************************************************
/// IRFFT
cufftReal *deviceOutputDataIFFT;
gpuErrchk(cudaMalloc((void**)&deviceOutputDataIFFT, DATASIZE * BATCH * sizeof(cufftReal)));
// --- Batched 1D IFFTs
cufftHandle handleIFFT;
int n_ifft[] = {DATASIZE / 2 + 1}; // --- Size of the Fourier transform
idist = DATASIZE / 2 + 1; odist = DATASIZE; // --- Distance between batches
cufftPlanMany(
&handleIFFT,
rank,
n_ifft,
inembed, istride, idist,
onembed, ostride, odist,
CUFFT_C2R,
batch);
cufftExecC2R(handleIFFT, deviceOutputData, deviceOutputDataIFFT);
/* scale
// dim3 dimGrid(512);
// dim3 dimBlock(max((BATCH * DATASIZE + 512 - 1) / 512, 1));
// IFFTScale<<<dimGrid, dimBlock>>>((DATASIZE - 1) * 2, deviceOutputData);
*/
// host output data for ifft
cufftReal *hostOutputDataIFFT = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
cudaMemcpy(hostOutputDataIFFT,
deviceOutputDataIFFT,
DATASIZE * BATCH * sizeof(cufftReal),
cudaMemcpyDeviceToHost);
// print IFFT recovered host output data
cout << "print host output IFFT data" << endl;
for (int i=0; i<BATCH; i++) {
for (int j=0; j<DATASIZE; j++) {
cout << hostOutputDataIFFT[i * DATASIZE + j] << ", ";
}
printf("\n");
}
cufftDestroy(handle);
gpuErrchk(cudaFree(deviceOutputData));
gpuErrchk(cudaFree(deviceInputData));
gpuErrchk(cudaFree(deviceOutputDataIFFT));
free(hostOutputDataIFFT);
free(hostInputData);
}
int main() {
batch_1d_irfft2_test();
return 0;
}
我通过nvcc -o rfft_test rfft_test.cu -lcufft
编译了'rfft_test.cu'文件。结果如下:
print host input data
1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
=====================================================
print IFFT recovered host output data
6, 8.5359, 15.4641, 0,
22, 24.5359, 31.4641, 0,
38, 40.5359, 47.4641, 0,
具体来说,我检查了cufftExecC2R(...)
的规模问题,并注释了IFFTScale()
内核函数。因此,我假设恢复的输出数据就像DATASIZE*input_batched_1d_data
,但即使这样,结果也不如预期。
我已经多次检查了cufft手册和代码,还搜索了一些Nvidia论坛和StackOverflow答案,但是没有找到任何解决方案。非常感谢任何人的帮助。 预先感谢。
答案 0 :(得分:0)
逆变换的大小不正确,应为DATASIZE而不是DATASIZE / 2 + 1。
cuFFT文档的以下部分应有所帮助:
“在C2R模式下,只需要非冗余复数元素的输入数组(x 1,x 2,…,x N 2 + 1)。” -N是您传递给计划功能的变换大小