Question

我是CUDA和CUFFT的新手，当我尝试通过应用相应的cufftExecC2R(...)恢复cufftExecC2R(...)的fft结果时，它出错了，恢复的数据和原始数据不相同

这是代码，我使用的cuda库是cuda-9.0。

#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "cuda.h"
#include "cufft.h"

#include <iostream>
#include <sys/time.h>
#include <cstdio>
#include <cmath>

using namespace std;


// cuda error check
#define gpuErrchk(ans) {gpuAssrt((ans), __FILE__, __LINE__);}
inline void gpuAssrt(cudaError_t code, const char* file, int line, bool abort=true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) {
            exit(code);
        }
    }
}

// ifft scale for cufft
__global__ void IFFTScale(int scale_, cufftReal* real) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    real[idx] *= 1.0 / scale_;
}


void batch_1d_irfft2_test() {
    const int BATCH = 3;
    const int DATASIZE = 4;

    /// RFFT
    // --- Host side input data allocation and initialization
    cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
    for (int i = 0; i < BATCH; ++ i) {
        for (int j = 0; j < DATASIZE; ++ j) {
            hostInputData[i * DATASIZE + j] = (cufftReal)(i * DATASIZE  + j + 1);
        }
    }

    // DEBUG:print host input data
    cout << "print host input data" << endl;
    for (int i = 0; i < BATCH; ++ i) {
        for (int j = 0; j < DATASIZE; ++ j) {
            cout << hostInputData[i * DATASIZE + j] << ", ";
        }
        cout << endl;
    }
    cout << "=====================================================" << endl;

    // --- Device side input data allocation and initialization
    cufftReal *deviceInputData; 
    gpuErrchk(cudaMalloc((void**)&deviceInputData, DATASIZE * BATCH * sizeof(cufftReal)));

    // --- Device side output data allocation
    cufftComplex *deviceOutputData; 
    gpuErrchk(cudaMalloc(
                (void**)&deviceOutputData, 
                (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));

    // Host sice input data copied to Device side 
    cudaMemcpy(deviceInputData, 
            hostInputData, 
            DATASIZE * BATCH * sizeof(cufftReal), 
            cudaMemcpyHostToDevice);

    // --- Batched 1D FFTs
    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = {DATASIZE};                 // --- Size of the Fourier transform
    int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
    int idist = DATASIZE, odist = DATASIZE / 2 + 1; // --- Distance between batches
    int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int batch = BATCH;                      // --- Number of batched executions
    cufftPlanMany(
            &handle, 
            rank, 
            n, 
            inembed, istride, idist, 
            onembed, ostride, odist, 
            CUFFT_R2C, 
            batch);
    cufftExecR2C(handle, deviceInputData, deviceOutputData);

    // **************************************************************************
    /// IRFFT
    cufftReal *deviceOutputDataIFFT; 
    gpuErrchk(cudaMalloc((void**)&deviceOutputDataIFFT, DATASIZE * BATCH * sizeof(cufftReal)));

    // --- Batched 1D IFFTs
    cufftHandle handleIFFT;
    int n_ifft[] = {DATASIZE / 2 + 1};                 // --- Size of the Fourier transform
    idist = DATASIZE / 2 + 1; odist = DATASIZE; // --- Distance between batches
    cufftPlanMany(
            &handleIFFT, 
            rank, 
            n_ifft, 
            inembed, istride, idist, 
            onembed, ostride, odist, 
            CUFFT_C2R, 
            batch);
    cufftExecC2R(handleIFFT, deviceOutputData, deviceOutputDataIFFT);

    /* scale
    // dim3 dimGrid(512);
    // dim3 dimBlock(max((BATCH * DATASIZE + 512  - 1) / 512, 1));
    // IFFTScale<<<dimGrid, dimBlock>>>((DATASIZE - 1) * 2, deviceOutputData);
    */

    // host output data for ifft
    cufftReal *hostOutputDataIFFT = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
    cudaMemcpy(hostOutputDataIFFT, 
            deviceOutputDataIFFT, 
            DATASIZE * BATCH * sizeof(cufftReal), 
            cudaMemcpyDeviceToHost);

    // print IFFT recovered host output data
    cout << "print host output IFFT data" << endl;
    for (int i=0; i<BATCH; i++) {
        for (int j=0; j<DATASIZE; j++) {
            cout << hostOutputDataIFFT[i * DATASIZE + j] << ", ";
        }
        printf("\n");
    }

    cufftDestroy(handle);
    gpuErrchk(cudaFree(deviceOutputData));
    gpuErrchk(cudaFree(deviceInputData));
    gpuErrchk(cudaFree(deviceOutputDataIFFT));
    free(hostOutputDataIFFT);
    free(hostInputData);
}

int main() {
    batch_1d_irfft2_test();

    return 0;
}

我通过nvcc -o rfft_test rfft_test.cu -lcufft编译了'rfft_test.cu'文件。结果如下：

print host input data
1, 2, 3, 4, 
5, 6, 7, 8, 
9, 10, 11, 12, 
=====================================================
print IFFT recovered host output data
6, 8.5359, 15.4641, 0, 
22, 24.5359, 31.4641, 0, 
38, 40.5359, 47.4641, 0,

具体来说，我检查了cufftExecC2R(...)的规模问题，并注释了IFFTScale()内核函数。因此，我假设恢复的输出数据就像DATASIZE*input_batched_1d_data，但即使这样，结果也不如预期。

我已经多次检查了cufft手册和代码，还搜索了一些Nvidia论坛和StackOverflow答案，但是没有找到任何解决方案。非常感谢任何人的帮助。预先感谢。

Answer 1

逆变换的大小不正确，应为DATASIZE而不是DATASIZE / 2 + 1。

cuFFT文档的以下部分应有所帮助：

“在C2R模式下，只需要非冗余复数元素的输入数组（x 1，x 2，…，x N 2 + 1）。” -N是您传递给计划功能的变换大小

批量1D ifft的CUFFT错误结果

1 个答案: