Question

假设我的序列x(n)长度为K * N，且只有前N个元素非零。我假设N << K，例如N = 10、K = 100000。我想通过FFTW计算这种序列的FFT。这相当于把一个长度为N的序列零填充到长度K * N。由于N和K都可能很“大”，需要填充大量的零。我正在探索是否可以避免显式的零填充，从而节省一些计算时间。

案例K = 2

让我们首先考虑K = 2的情况。在这种情况下，x(n)的DFT可以写为

$$X(k)=\sum_{n=0}^{N-1}x(n)\,e^{-j\frac{2\pi nk}{2N}},\qquad k=0,\dots,2N-1$$

如果k是偶数，即k = 2 * m，那么

$$X(2m)=\sum_{n=0}^{N-1}x(n)\,e^{-j\frac{2\pi nm}{N}},\qquad m=0,\dots,N-1$$

这意味着DFT的这些值可以通过对一个长度为N（而不是K * N）的序列做FFT来计算。

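如果k是奇数，即k = 2 * m + 1，那么

$$X(2m+1)=\sum_{n=0}^{N-1}\left[x(n)\,e^{-j\frac{\pi n}{N}}\right]e^{-j\frac{2\pi nm}{N}},\qquad m=0,\dots,N-1$$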

这意味着DFT的这些值同样可以通过对一个长度为N（而不是K * N）的序列做FFT来计算。

因此，总之，我可以用2个长度为N的FFT来代替1个长度为2 * N的FFT。

任意K的情况

在这种情况下，我们有

$$X(k)=\sum_{n=0}^{N-1}x(n)\,e^{-j\frac{2\pi nk}{KN}},\qquad k=0,\dots,KN-1$$

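令k = m * K + t（其中t = 0, …, K - 1，m = 0, …, N - 1），我们有

$$X(mK+t)=\sum_{n=0}^{N-1}\left[x(n)\,e^{-j\frac{2\pi nt}{KN}}\right]e^{-j\frac{2\pi nm}{N}}$$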

因此，总而言之，我可以用K个长度为N的FFT来代替1个长度为K * N的FFT。由于FFTW提供了fftw_plan_many_dft，我可以期望相对于单个FFT的情况获得一些增益。

为了验证这一点，我编写了以下代码

fftw_plan_many_dft

我开发的方法包括三个步骤：

将输入序列乘以“旋转”（twiddle）复数指数；
执行fftw_many（即对K个长度为N的序列做批量FFT）；
执行#include <stdio.h> #include <stdlib.h> /* srand, rand */ #include <time.h> /* time */ #include <math.h> #include <fstream> #include <fftw3.h> #include "TimingCPU.h" #define PI_d 3.141592653589793 void main() { const int N = 10; const int K = 100000; fftw_plan plan_zp; fftw_complex *h_x = (fftw_complex *)malloc(N * sizeof(fftw_complex)); fftw_complex *h_xzp = (fftw_complex *)calloc(N * K, sizeof(fftw_complex)); fftw_complex *h_xpruning = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); fftw_complex *h_xhatpruning = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); fftw_complex *h_xhatpruning_temp = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); fftw_complex *h_xhat = (fftw_complex *)malloc(N * K * sizeof(fftw_complex)); // --- Random number generation of the data sequence srand(time(NULL)); for (int k = 0; k < N; k++) { h_x[k][0] = (double)rand() / (double)RAND_MAX; h_x[k][1] = (double)rand() / (double)RAND_MAX; } memcpy(h_xzp, h_x, N * sizeof(fftw_complex)); plan_zp = fftw_plan_dft_1d(N * K, h_xzp, h_xhat, FFTW_FORWARD, FFTW_ESTIMATE); fftw_plan plan_pruning = fftw_plan_many_dft(1, &N, K, h_xpruning, NULL, 1, N, h_xhatpruning_temp, NULL, 1, N, FFTW_FORWARD, FFTW_ESTIMATE); TimingCPU timerCPU; timerCPU.StartCounter(); fftw_execute(plan_zp); printf("Stadard %f\n", timerCPU.GetCounter()); timerCPU.StartCounter(); double factor = -2. 
* PI_d / (K * N); for (int k = 0; k < K; k++) { double arg1 = factor * k; for (int n = 0; n < N; n++) { double arg = arg1 * n; double cosarg = cos(arg); double sinarg = sin(arg); h_xpruning[k * N + n][0] = h_x[n][0] * cosarg - h_x[n][1] * sinarg; h_xpruning[k * N + n][1] = h_x[n][0] * sinarg + h_x[n][1] * cosarg; } } printf("Optimized first step %f\n", timerCPU.GetCounter()); timerCPU.StartCounter(); fftw_execute(plan_pruning); printf("Optimized second step %f\n", timerCPU.GetCounter()); timerCPU.StartCounter(); for (int k = 0; k < K; k++) { for (int p = 0; p < N; p++) { h_xhatpruning[p * K + k][0] = h_xhatpruning_temp[p + k * N][0]; h_xhatpruning[p * K + k][1] = h_xhatpruning_temp[p + k * N][1]; } } printf("Optimized third step %f\n", timerCPU.GetCounter()); double rmserror = 0., norm = 0.; for (int n = 0; n < N; n++) { rmserror = rmserror + (h_xhatpruning[n][0] - h_xhat[n][0]) * (h_xhatpruning[n][0] - h_xhat[n][0]) + (h_xhatpruning[n][1] - h_xhat[n][1]) * (h_xhatpruning[n][1] - h_xhat[n][1]); norm = norm + h_xhat[n][0] * h_xhat[n][0] + h_xhat[n][1] * h_xhat[n][1]; } printf("rmserror %f\n", 100. * sqrt(rmserror / norm)); fftw_destroy_plan(plan_zp); };
重组结果。

fftw_many比在K * N个输入点上的单个FFTW快。但是，步骤＃1和＃3完全抵消了这种增益。我原本预期步骤＃1和＃3的计算量会比步骤＃2轻得多。

我的问题是：

为什么步骤＃1和＃3的计算开销会比步骤＃2更大？
如何改进步骤＃1和＃3以获得“标准”方法的净收益？

非常感谢您的任何提示。

修改

我正在使用Visual Studio 2013并在发布模式下进行编译。

Answer 1

可以更快地运行的几个选项：

如果您只运行单线程并且有多个可用核心，则运行多线程。
创建并保存FFTW wisdom（“智慧”）文件，尤其是在预先知道FFT尺寸的情况下。使用FFTW_EXHAUSTIVE生成一次，之后重新加载保存的wisdom，而不是每次都重新计算。如果您希望结果一致，这一点也很重要：FFTW可能依据不同的wisdom以不同的方式计算FFT，而每次生成的wisdom不一定相同，因此即使给出相同的输入数据，不同的运行过程也可能产生不同的结果。
如果您使用的是x86，请以64位模式运行。FFTW算法对寄存器的需求极高，而x86 CPU在64位模式下可用的通用寄存器比32位模式下多得多。
由于FFTW算法是寄存器密集型的，我曾通过在编译FFTW时使用禁止预取并禁止隐式函数内联的编译器选项，成功提高了FFTW的性能。

Answer 2

对于第三步，您可能想尝试切换循环的顺序：

for (int p = 0; p < N; p++) {
    for (int k = 0; k < K; k++) {
        h_xhatpruning[p * K + k][0] = h_xhatpruning_temp[p + k * N][0];
        h_xhatpruning[p * K + k][1] = h_xhatpruning_temp[p + k * N][1];
    }
}

因为让存储（写入）地址连续通常比让加载（读取）地址连续更有利。

但是，无论哪种方式，您的访问模式都对缓存不友好。您可以尝试通过分块（blocking）来改善这一点，例如假设N是4的倍数：

// Step #3 with the bin index p processed in blocks: for each transform k,
// BLOCK adjacent source elements are read together before moving on to the
// next transform.
// The block end is clamped to N so the loop is also valid when N is not a
// multiple of BLOCK (e.g. N = 10); an unclamped 4-wide inner loop would index
// past bin N - 1 in that case.
const int BLOCK = 4;    // tunable: try other sizes to look for a "sweet spot"
for (int p = 0; p < N; p += BLOCK) {
    const int pEnd = (p + BLOCK < N) ? (p + BLOCK) : N;
    for (int k = 0; k < K; k++) {
        for (int q = p; q < pEnd; q++) {
            h_xhatpruning[q * K + k][0] = h_xhatpruning_temp[q + k * N][0];
            h_xhatpruning[q * K + k][1] = h_xhatpruning_temp[q + k * N][1];
        }
    }
}

这应该有助于减少缓存行的反复换入换出（churn）。如果确实有效，还可以尝试4以外的块大小，看看是否存在一个“最佳值”（sweet spot）。

Answer 3

根据Paul R的评论，我改进了我的代码。现在，替代方法比标准（零填充）方法更快。下面是完整的C++代码。对于步骤＃1和＃3，我把尝试过的其他方案注释掉了，它们要么比未注释的方案慢，要么速度相当。考虑到将来更容易进行CUDA并行化，我还优先采用了非嵌套的for循环。我还没有为FFTW使用多线程。

#include <stdio.h>
#include <stdlib.h>     /* srand, rand */
#include <string.h>     /* memcpy */
#include <time.h>       /* time */
#include <math.h>
#include <fstream>

#include <omp.h>

#include <fftw3.h>

#include "TimingCPU.h"

#define PI_d            3.141592653589793

/******************/
/* STEP #1 ON CPU */
/******************/
/**
 * Step #1 of the pruned FFT: builds K twiddled copies of the N-sample input.
 *
 * For every k in [0, K) and n in [0, N):
 *     h_xpruning[k * N + n] = h_x[n] * exp(j * factor * k * n),
 * with factor = -2 * pi / (K * N). Real parts are stored in [..][0],
 * imaginary parts in [..][1].
 *
 * @param h_xpruning  output, K * N complex values, written as K consecutive rows of N
 * @param h_x         input, N complex values
 * @param N           number of input samples
 * @param K           number of twiddled copies
 */
void step1CPU(fftw_complex * __restrict h_xpruning, const fftw_complex * __restrict h_x, const int N, const int K) {

    // Variants of this loop that were tried and then disabled (same arithmetic,
    // same output layout h_xpruning[k * N + n]):
    //   1. k outer / n inner, nested OpenMP: omp_set_nested(1),
    //      "parallel for private(n) num_threads(4)" on the outer loop and
    //      "parallel for num_threads(4)" on the inner loop;
    //   2. n outer / k inner, nested OpenMP set up the same way (private(k));
    //   3. k outer / n inner, serial;
    //   4. n outer / k inner, serial.
    // The version below is a single flat loop over all K * N outputs.

    const double factor = -2. * PI_d / (K * N);
    const int total = K * N;

    #pragma omp parallel for num_threads(8)
    for (int idx = 0; idx < total; idx++) {
        const int k = idx / N;      // which twiddled copy
        const int n = idx % N;      // which input sample
        const double phase = factor * k * n;
        const double c = cos(phase);
        const double s = sin(phase);
        const double re = h_x[n][0];
        const double im = h_x[n][1];
        // Complex product (re + j*im) * (c + j*s).
        h_xpruning[idx][0] = re * c - im * s;
        h_xpruning[idx][1] = re * s + im * c;
    }
}

/******************/
/* STEP #3 ON CPU */
/******************/
/**
 * Step #3 of the pruned FFT: reorders the output of the K batched length-N
 * FFTs into natural frequency order.
 *
 * Bin `col` of transform `row` (stored at h_xhatpruning_temp[col + row * N])
 * is copied to h_xhatpruning[col * K + row], for col in [0, N) and
 * row in [0, K).
 *
 * @param h_xhatpruning       output, K * N complex values
 * @param h_xhatpruning_temp  input, K consecutive blocks of N complex values
 * @param N                   length of each small FFT
 * @param K                   number of small FFTs
 */
void step3CPU(fftw_complex * __restrict h_xhatpruning, const fftw_complex * __restrict h_xhatpruning_temp, const int N, const int K) {

    // Variants of this copy that were tried and then disabled (same mapping
    // h_xhatpruning[p * K + k] = h_xhatpruning_temp[p + k * N]):
    //   1. p outer / k inner, nested OpenMP: omp_set_nested(1),
    //      "parallel for private(k) num_threads(4)" on the outer loop and
    //      "parallel for num_threads(4)" on the inner loop;
    //   2. k outer / p inner, nested OpenMP set up the same way (private(p));
    //   3. p outer / k inner, serial;
    //   4. k outer / p inner, serial;
    //   5. serial, p processed in blocks of 2 (p += 2, inner p0 in [0, 2)).
    // The version below is a single flat loop over all K * N source elements.

    #pragma omp parallel for num_threads(8)
    for (int p = 0; p < K * N; p++) {
        const int col = p % N;  // frequency bin inside one length-N transform
        // Transform index. The flat index p enumerates the source array,
        // whose rows are N long, so the row is p / N. Dividing by K instead
        // limits row to [0, N) and leaves most of the output unwritten
        // whenever K != N.
        const int row = p / N;
        h_xhatpruning[col * K + row][0] = h_xhatpruning_temp[col + row * N][0];
        h_xhatpruning[col * K + row][1] = h_xhatpruning_temp[col + row * N][1];
    }

}

/********/
/* MAIN */
/********/
/**
 * Benchmark driver: times one zero-padded FFT of N * K points against the
 * three-step pruned scheme (step1CPU, K batched FFTs of length N, step3CPU)
 * and prints the relative RMS error between the two spectra.
 *
 * @return 0 on success, EXIT_FAILURE if a buffer or plan cannot be created.
 */
int main() {

    int N = 10;         // number of non-zero input samples / length of each small FFT
    int K = 100000;     // zero-padding factor: the reference FFT has N * K points

    // --- CPU memory allocations
    fftw_complex *h_x = (fftw_complex *)malloc(N     * sizeof(fftw_complex));
    fftw_complex *h_xzp = (fftw_complex *)calloc(N * K, sizeof(fftw_complex));   // zero-initialised: the explicit padding
    fftw_complex *h_xpruning = (fftw_complex *)malloc(N * K * sizeof(fftw_complex));
    fftw_complex *h_xhatpruning = (fftw_complex *)malloc(N * K * sizeof(fftw_complex));
    fftw_complex *h_xhatpruning_temp = (fftw_complex *)malloc(N * K * sizeof(fftw_complex));
    fftw_complex *h_xhat = (fftw_complex *)malloc(N * K * sizeof(fftw_complex));
    //double2        *h_xhatGPU = (double2 *)malloc(N * K * sizeof(double2));

    if (!h_x || !h_xzp || !h_xpruning || !h_xhatpruning || !h_xhatpruning_temp || !h_xhat) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_x); free(h_xzp); free(h_xpruning);
        free(h_xhatpruning); free(h_xhatpruning_temp); free(h_xhat);
        return EXIT_FAILURE;
    }

    // --- Random number generation of the data sequence on the CPU
    srand(time(NULL));
    for (int k = 0; k < N; k++) {
        h_x[k][0] = (double)rand() / (double)RAND_MAX;
        h_x[k][1] = (double)rand() / (double)RAND_MAX;
    }
    //gpuErrchk(cudaMemcpy(d_x, h_x, N * sizeof(double2), cudaMemcpyHostToDevice));

    // Only the first N entries of the padded buffer are non-zero.
    memcpy(h_xzp, h_x, N * sizeof(fftw_complex));

    // --- FFTW plans
    // Reference: one FFT of N * K points.
    fftw_plan h_plan_zp      = fftw_plan_dft_1d(N * K, h_xzp, h_xhat, FFTW_FORWARD, FFTW_ESTIMATE);
    // Pruned: K contiguous FFTs of length N (stride 1, distance N between transforms).
    fftw_plan h_plan_pruning = fftw_plan_many_dft(1, &N, K, h_xpruning, NULL, 1, N, h_xhatpruning_temp, NULL, 1, N, FFTW_FORWARD, FFTW_ESTIMATE);

    if (!h_plan_zp || !h_plan_pruning) {
        fprintf(stderr, "FFTW plan creation failed\n");
        if (h_plan_zp)      fftw_destroy_plan(h_plan_zp);
        if (h_plan_pruning) fftw_destroy_plan(h_plan_pruning);
        free(h_x); free(h_xzp); free(h_xpruning);
        free(h_xhatpruning); free(h_xhatpruning_temp); free(h_xhat);
        return EXIT_FAILURE;
    }

    double totalTimeCPU = 0.;
    double partialTimeCPU;

    /****************************/
    /* STANDARD APPROACH ON CPU */
    /****************************/
    printf("Number of processors available = %i\n", omp_get_num_procs());
    printf("Number of threads              = %i\n", omp_get_max_threads());

    TimingCPU timerCPU;
    timerCPU.StartCounter();
    fftw_execute(h_plan_zp);
    printf("\nStadard on CPU: \t \t %f\n", timerCPU.GetCounter());

    /******************/
    /* STEP #1 ON CPU */
    /******************/
    timerCPU.StartCounter();
    step1CPU(h_xpruning, h_x, N, K);
    partialTimeCPU = timerCPU.GetCounter();
    totalTimeCPU = totalTimeCPU + partialTimeCPU;
    printf("\nOptimized first step CPU: \t %f\n", partialTimeCPU);

    /******************/
    /* STEP #2 ON CPU */
    /******************/
    timerCPU.StartCounter();
    fftw_execute(h_plan_pruning);
    partialTimeCPU = timerCPU.GetCounter();
    totalTimeCPU = totalTimeCPU + partialTimeCPU;
    // Print the value that was accumulated, not a second, later timer reading.
    printf("Optimized second step CPU: \t %f\n", partialTimeCPU);

    /******************/
    /* STEP #3 ON CPU */
    /******************/
    timerCPU.StartCounter();
    step3CPU(h_xhatpruning, h_xhatpruning_temp, N, K);
    partialTimeCPU = timerCPU.GetCounter();
    totalTimeCPU = totalTimeCPU + partialTimeCPU;
    printf("Optimized third step CPU: \t %f\n", partialTimeCPU);

    printf("Total time CPU: \t \t %f\n", totalTimeCPU);

    // Compare the complete spectra (all N * K bins), not just the first N,
    // so that an error in the reordering step cannot go unnoticed.
    double rmserror = 0., norm = 0.;
    for (int n = 0; n < N * K; n++) {
        const double dRe = h_xhatpruning[n][0] - h_xhat[n][0];
        const double dIm = h_xhatpruning[n][1] - h_xhat[n][1];
        rmserror = rmserror + dRe * dRe + dIm * dIm;
        norm = norm + h_xhat[n][0] * h_xhat[n][0] + h_xhat[n][1] * h_xhat[n][1];
    }
    printf("\nrmserror %f\n", 100. * sqrt(rmserror / norm));

    // Release both plans and every host buffer.
    fftw_destroy_plan(h_plan_zp);
    fftw_destroy_plan(h_plan_pruning);
    free(h_x); free(h_xzp); free(h_xpruning);
    free(h_xhatpruning); free(h_xhatpruning_temp); free(h_xhat);

    return 0;
}

对于如下情况

N = 10
K = 100000

我测得的耗时如下

Stadard on CPU:                  23.895417

Optimized first step CPU:        4.472087
Optimized second step CPU:       4.926603
Optimized third step CPU:        2.394958
Total time CPU:                  11.793648

加速FFTW修剪以避免大量零填充

3 个答案: