CUFFT:尝试对矩阵逐行执行FFT

时间:2014-07-09 00:12:21

标签: matlab cuda fft cufft

我正在尝试复现matlab的fft功能,它对矩阵逐行(或逐列)执行变换。每一行都是cuFFT计划(plan)中的一个批次(batch)。

使用cufftExecC2C时可以正常工作(下面代码中注释掉的部分是可用的),但使用cufftExecR2C时却不行。我的代码使用的是cufftPlan1d,但理想情况下我想用cufftPlanMany来实现。

我想知道我做错了什么,如果有更好的方法可以做到这一点。谢谢。

// linker -> input -> additional dependencies -> add 'cufft.lib'
// VC++ Directories -> include directories - > add 'C:\ProgramData\NVIDIA Corporation\CUDA Samples\v6.0\common\inc'

#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>

#include <iostream>

#define NX 6
#define NY 5

void printArray(float *my_array);
void printComplexArray(float2 *my_array);

// Question's test program: builds an NX x NY row-major host matrix holding a
// single impulse, runs a batched 1D FFT over it on the GPU (one batch entry per
// matrix row) and prints the input and the transformed output.
// The C2C variant is kept commented out below; the R2C variant is active.
// None of the CUDA / cuFFT return codes are checked in this version.
int main(){

/************************************************************ C2C ************************************************************/
/*  
    float2 *initial_array = (float2 *)malloc(sizeof(float2) * NX * NY);
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++){
            initial_array[NY * h + w].x = 0;
            initial_array[NY * h + w].y = 0;
        }
    }
    initial_array[NY*3 + 0].x = 1;
    initial_array[NY*5 + 0].x = 1;

    printComplexArray(initial_array);

    float2 *transformed_array= (float2 *)malloc(sizeof(float2) * NX * NY);

    cufftComplex *gpu_initial_array;
    cufftComplex *gpu_transformed_array;

    cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftComplex));
    cudaMalloc((void **)&gpu_transformed_array, NX*NY*sizeof(cufftComplex));

    cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float2), cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftPlan1d(&plan, NY, CUFFT_C2C, NX);

    cufftExecC2C(plan, gpu_initial_array, gpu_transformed_array, CUFFT_FORWARD);

    cudaMemcpy(transformed_array, gpu_transformed_array, NX*NY*sizeof(cufftComplex), cudaMemcpyDeviceToHost);

    printComplexArray(transformed_array);
*/
/************************************************************ C2C ************************************************************/

/************************************************************ R2C ************************************************************/

    // Host input: NX rows of NY floats, zero-filled.
    float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            initial_array[NY * h + w] = 0;
    }

    // Single impulse in the first column of row 3.
    initial_array[NY*3 + 0] = 1;

    printArray(initial_array);

    // Host output buffer: NX rows of (NY/2+1) complex values each.
    float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);

    cufftReal *gpu_initial_array;
    cufftComplex *gpu_transformed_array;

    // Device buffers sized to match the host input and output buffers above.
    cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal));
    cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex));

    cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice);

    // 1D real-to-complex plan: transform length NY, batch count NX.
    cufftHandle plan;
    cufftPlan1d(&plan, NY, CUFFT_R2C, NX);

    //                       ***** cufftPlanMany *****
    // NOTE(review): with rank 1 only n[0] should be consulted, so n[0] would
    // presumably need to be the transform length NY, not NX -- confirm against
    // the cufftPlanMany documentation before enabling this.
    //int n[2] = {NX, NY};
    //cufftPlanMany(&plan,1,n,NULL,1,0,NULL,1,0,CUFFT_R2C,NX);

    cufftExecR2C(plan, gpu_initial_array, gpu_transformed_array);

    // Copies back exactly NX*(NY/2+1) complex values.
    cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost);

    // NOTE(review): printComplexArray indexes my_array[NY * h + w] for NX*NY
    // elements, but transformed_array holds only NX*(NY/2+1) elements. The print
    // therefore uses the wrong row stride and reads past the end of the buffer.
    printComplexArray(transformed_array);

/************************************************************ R2C ************************************************************/

    // Release the plan and the host and device buffers.
    cufftDestroy(plan);
    free(initial_array);
    free(transformed_array);
    cudaFree(gpu_initial_array);
    cudaFree(gpu_transformed_array);

    // Runs the shell command "pause" before exiting.
    std::system("pause");
    return 0;
}

// Writes the NX x NY real matrix stored row-major in my_array to stdout:
// one matrix row per output line, every value followed by " | ", and a
// blank line after the last row.
void printArray(float *my_array){
    const float *row = my_array;
    for (int r = 0; r < NX; ++r, row += NY){
        for (int c = 0; c < NY; ++c){
            std::cout << row[c] << " | ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Writes a complex matrix to stdout as "re + im | " entries, one matrix row
// per output line, followed by a blank line.
// Reads NX rows of NY float2 elements with a row stride of NY, so my_array
// must hold at least NX*NY elements.
void printComplexArray(float2 *my_array){
    for (int r = 0; r < NX; ++r){
        const float2 *row = my_array + NY * r;
        for (int c = 0; c < NY; ++c){
            const float2 &z = row[c];
            std::cout << z.x << " + " << z.y << " | ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

1 个答案:

答案 0 :(得分:3)

您的问题似乎与您打印结果的方式有关。对于CUFFT_R2C和CUFFT_C2C这两种情况,您不能使用相同的例程进行打印。在前一种情况下,输出的大小为(NY/2+1)*NX,而在后一种情况下,输出的大小为NY*NX。下面修正后的代码应该可以正常工作。

此外,最好添加适当的CUDA错误检查(CUDA error check)和CUFFT错误检查(CUFFT error check),我也已将它们添加到下面的代码中。

#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>
#include <assert.h>

#include <iostream>

#define NX 6
#define NY 5

void printArray(float *my_array);
void printComplexSymmetricArray(float2 *my_array);

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// Translates a cufftResult status code into the name of its enumerator.
// Any code not listed in the switch yields "<unknown>".
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:        return "CUFFT_SUCCESS";
        case CUFFT_INVALID_PLAN:   return "CUFFT_INVALID_PLAN";
        case CUFFT_ALLOC_FAILED:   return "CUFFT_ALLOC_FAILED";
        case CUFFT_INVALID_TYPE:   return "CUFFT_INVALID_TYPE";
        case CUFFT_INVALID_VALUE:  return "CUFFT_INVALID_VALUE";
        case CUFFT_INTERNAL_ERROR: return "CUFFT_INTERNAL_ERROR";
        case CUFFT_EXEC_FAILED:    return "CUFFT_EXEC_FAILED";
        case CUFFT_SETUP_FAILED:   return "CUFFT_SETUP_FAILED";
        case CUFFT_INVALID_SIZE:   return "CUFFT_INVALID_SIZE";
        case CUFFT_UNALIGNED_DATA: return "CUFFT_UNALIGNED_DATA";
        default:                   break;
    }

    return "<unknown>";
}

// cufftSafeCall(call): evaluates a cuFFT call and forwards its cufftResult,
// together with the call site's file and line, to __cufftSafeCall.
#define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)

// Reports a failed cuFFT call and terminates the program.
//   err  - status returned by the cuFFT call
//   file - source file of the call site
//   line - source line of the call site
// Does nothing when err == CUFFT_SUCCESS.
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if (CUFFT_SUCCESS != err) {
        // Print the caller's location (file/line parameters), not this helper's.
        // Each conversion specifier has exactly one matching argument.
        fprintf(stderr, "CUFFT error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, (int)err, _cudaGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
        // assert() compiles to nothing under NDEBUG; still terminate as announced.
        exit(EXIT_FAILURE);
    }
}

/********/
/* MAIN */
/********/
// Runs a batched 1D real-to-complex FFT over the rows of an NX x NY matrix
// (one batch entry per row) and prints the input and the result.
// Every CUDA and cuFFT call is checked; host allocations are checked as well.
// Returns 0 on success, EXIT_FAILURE if a host allocation fails.
int main(){

    // Host input: NX rows of NY floats, row-major, zero-filled.
    float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
    if (initial_array == NULL) {
        fprintf(stderr, "Host allocation of the input array failed\n");
        return EXIT_FAILURE;
    }
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            initial_array[NY * h + w] = 0;
    }

    // Single impulse in the first column of row 3.
    initial_array[NY*3 + 0] = 1;

    printArray(initial_array);

    // Host output: the R2C result holds NY/2+1 complex values per row.
    float2 *transformed_array = (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);
    if (transformed_array == NULL) {
        fprintf(stderr, "Host allocation of the output array failed\n");
        free(initial_array);
        return EXIT_FAILURE;
    }

    cufftReal *gpu_initial_array;
    cufftComplex *gpu_transformed_array;

    gpuErrchk(cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal)));
    gpuErrchk(cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex)));

    gpuErrchk(cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice));

    // 1D real-to-complex plan: transform length NY, batch count NX.
    cufftHandle plan;
    cufftSafeCall(cufftPlan1d(&plan, NY, CUFFT_R2C, NX));

    cufftSafeCall(cufftExecR2C(plan, gpu_initial_array, gpu_transformed_array));

    gpuErrchk(cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost));

    // Printer whose row stride matches the (NY/2+1)-wide R2C output.
    printComplexSymmetricArray(transformed_array);

    cufftSafeCall(cufftDestroy(plan));
    free(initial_array);
    free(transformed_array);
    gpuErrchk(cudaFree(gpu_initial_array));
    gpuErrchk(cudaFree(gpu_transformed_array));

    // Runs the shell command "pause" before exiting.
    std::system("pause");
    return 0;
}

/***********************/
/* PRINTOUT REAL ARRAY */
/***********************/
// Prints the NX x NY real matrix held row-major in my_array: each value is
// followed by " | ", each matrix row ends with a newline, and a blank line
// closes the output.
void printArray(float *my_array){
    for (int row = 0; row < NX; ++row){
        const float *values = my_array + NY * row;
        for (int col = 0; col < NY; ++col){
            std::cout << values[col] << " | ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

/************************************/
/* PRINTOUT COMPLEX SYMMETRIC ARRAY */
/************************************/
// Prints a complex matrix of NX rows with NY/2+1 entries per row, stored
// row-major with a row stride of NY/2+1. Each entry is written as
// "re + im | "; every matrix row ends with a newline and a blank line closes
// the output.
void printComplexSymmetricArray(float2 *my_array){
    const int row_len = NY/2+1;
    for (int row = 0; row < NX; ++row){
        const float2 *entries = my_array + row_len * row;
        for (int col = 0; col < row_len; ++col){
            std::cout << entries[col].x << " + " << entries[col].y << " | ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}