Question

我在 cuFFT（CUDA 9，Nvidia 1080）上运行了以下代码。每个计时段执行的代码完全相同，但是执行时间（见代码下方）差别很大。有人能解释一下造成这种现象的原因，以及如何始终获得最短的时间吗？

// Question snippet: times four identical batches of 500 forward (D2Z) +
// inverse (Z2D) cuFFT calls using the host wall clock (omp_get_wtime).
// NOTE(review): this is a fragment, not a complete translation unit.
// NOTE(review): the next two lines are not valid C++ as written ('=' is
// missing) — presumably they were `#define`s in the original; confirm.
int NX 2048
int BATCH 96

cufftHandle plan;
// NOTE(review): rev_plan is created below but never passed to any exec call.
cufftHandle rev_plan;
cufftDoubleReal *idata;
cufftDoubleComplex *odata;

int BLOCKSIZE  = 1024;
// Integer division: exact here because NX * BATCH is a multiple of BLOCKSIZE.
int gridSize = (NX * BATCH)/BLOCKSIZE;

// One batched 1-D plan per direction; return codes are not checked.
cufftPlan1d(&plan, NX, CUFFT_D2Z, BATCH);
cufftPlan1d(&rev_plan, NX, CUFFT_Z2D, BATCH);


// Real input: NX values per batch; complex output: NX/2 + 1 values per batch.
cudaMalloc((void **) &idata, sizeof(cufftDoubleReal) * NX * BATCH);
cudaMalloc((void **) &odata, sizeof(cufftDoubleComplex) * (NX / 2 + 1) * BATCH);
// inputData is a kernel defined outside this snippet.
inputData << < gridSize, BLOCKSIZE >> > (idata, NX * BATCH);

// Timing region 1 of 4. No device synchronization happens before the timer
// is read, so this measures how long the calls take to return on the host,
// not necessarily how long the GPU work takes.
double sT = omp_get_wtime();
for (int i = 0; i < 500; ++i) {
    cufftExecD2Z(plan, idata, odata);
    // NOTE(review): the Z2D exec is given the D2Z `plan`, not `rev_plan`.
    cufftExecZ2D(plan, odata, idata);
}
printf("Time taken: %f\n", omp_get_wtime() - sT);

// Timing region 2 of 4 (same code as region 1).
sT = omp_get_wtime();
for (int i = 0; i < 500; ++i) {
    cufftExecD2Z(plan, idata, odata);
    cufftExecZ2D(plan, odata, idata);
}
printf("Time taken: %f\n", omp_get_wtime() - sT);

// Timing region 3 of 4 (same code as region 1).
sT = omp_get_wtime();
for (int i = 0; i < 500; ++i) {
    cufftExecD2Z(plan, idata, odata);
    cufftExecZ2D(plan, odata, idata);
}
printf("Time taken: %f\n", omp_get_wtime() - sT);

// Timing region 4 of 4 (same code as region 1).
sT = omp_get_wtime();
for (int i = 0; i < 500; ++i) {
    cufftExecD2Z(plan, idata, odata);
    cufftExecZ2D(plan, odata, idata);
}
printf("Time taken: %f\n", omp_get_wtime() - sT);

// Device buffers are released; the two cuFFT plans are not destroyed here.
cudaFree(idata);
cudaFree(odata);

Time taken: 0.004334
Time taken: 0.022906
Time taken: 0.027820
Time taken: 0.027786

Answer 1

对 cuFFT 例程的调用可以是异步的。

这意味着调用可能在工作完成之前就返回。

这种异步行为只能在一定限度内发生。存在一个异步启动队列：队列填满之后，只有当队列中已有的项目被调度执行时，才会空出新的位置。这意味着此后的启动过程不再是异步的。

这使您的计时结果不正确。

要“修复”此问题，请在每个计时区域的末尾（即紧接在每个 printf 语句之前）添加一个 cudaDeviceSynchronize(); 调用。这将迫使所有 GPU 工作在时间测量结束之前完成，从而使各次结果相当一致。

$ cat t37.cu
#include <cufft.h>
#include <omp.h>
#include <cuda_runtime_api.h>
#include <cstdio>

// Runs `iterations` forward (D2Z) + inverse (Z2D) transform pairs and prints
// the elapsed host wall-clock time in the same "Time taken: %f" format for
// every region.
//
// fwdPlan  - plan created with CUFFT_D2Z
// invPlan  - plan created with CUFFT_Z2D
// realData - device buffer holding the real samples (overwritten in place)
// freqData - device buffer receiving the complex spectrum
//
// Returns false (after reporting on stderr) if any cuFFT/CUDA call fails.
static bool timeRoundTrips(cufftHandle fwdPlan, cufftHandle invPlan,
                           cufftDoubleReal *realData,
                           cufftDoubleComplex *freqData, int iterations) {
  const double start = omp_get_wtime();
  for (int i = 0; i < iterations; ++i) {
    if (cufftExecD2Z(fwdPlan, realData, freqData) != CUFFT_SUCCESS ||
        cufftExecZ2D(invPlan, freqData, realData) != CUFFT_SUCCESS) {
      fprintf(stderr, "cuFFT exec failed at iteration %d\n", i);
      return false;
    }
  }
  #ifdef FIX
  // cuFFT exec calls may return before the GPU work has finished. Waiting
  // here makes the timer cover the work itself rather than only the time
  // needed to enqueue it.
  const cudaError_t syncErr = cudaDeviceSynchronize();
  if (syncErr != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize failed: %s\n",
            cudaGetErrorString(syncErr));
    return false;
  }
  #endif
  printf("Time taken: %f\n", omp_get_wtime() - start);
  return true;
}

// Demonstrates how asynchronous cuFFT launches distort host-side timing:
// build with -DFIX to synchronize the device before each timer read.
// Returns 0 on success, 1 if any cuFFT/CUDA call fails.
int main(){

  const int NX = 2048;        // transform length
  const int BATCH = 96;       // transforms per exec call
  const int ITERATIONS = 500; // round trips per timing region
  const int REGIONS = 4;      // number of identical timing regions

  cufftHandle plan;
  cufftHandle rev_plan;
  cufftDoubleReal *idata = NULL;
  cufftDoubleComplex *odata = NULL;

  if (cufftPlan1d(&plan, NX, CUFFT_D2Z, BATCH) != CUFFT_SUCCESS) {
    fprintf(stderr, "cufftPlan1d (D2Z) failed\n");
    return 1;
  }
  if (cufftPlan1d(&rev_plan, NX, CUFFT_Z2D, BATCH) != CUFFT_SUCCESS) {
    fprintf(stderr, "cufftPlan1d (Z2D) failed\n");
    cufftDestroy(plan);
    return 1;
  }

  // Real input holds NX samples per batch; the complex output of a
  // real-to-complex transform holds NX/2 + 1 values per batch.
  const size_t realBytes = sizeof(cufftDoubleReal) * NX * BATCH;
  const size_t freqBytes = sizeof(cufftDoubleComplex) * (NX / 2 + 1) * BATCH;

  cudaError_t err = cudaMalloc((void **) &idata, realBytes);
  if (err == cudaSuccess) err = cudaMalloc((void **) &odata, freqBytes);
  // The original input-fill kernel is not part of this listing, so zero the
  // input instead of transforming uninitialized device memory.
  if (err == cudaSuccess) err = cudaMemset(idata, 0, realBytes);

  bool ok = (err == cudaSuccess);
  if (!ok) {
    fprintf(stderr, "device buffer setup failed: %s\n",
            cudaGetErrorString(err));
  }

  // The inverse direction uses rev_plan, the plan that was actually created
  // for CUFFT_Z2D.
  for (int region = 0; ok && region < REGIONS; ++region) {
    ok = timeRoundTrips(plan, rev_plan, idata, odata, ITERATIONS);
  }

  // cudaFree on a null pointer is a no-op, so this is safe on failure paths.
  cudaFree(idata);
  cudaFree(odata);
  cufftDestroy(rev_plan);
  cufftDestroy(plan);
  return ok ? 0 : 1;
}
$ nvcc -o t37 t37.cu -lcufft -lgomp
$ ./t37
Time taken: 0.007373
Time taken: 0.185308
Time taken: 0.196998
Time taken: 0.196857
$ nvcc -o t37 t37.cu -lcufft -lgomp -DFIX
$ ./t37
Time taken: 0.197076
Time taken: 0.196994
Time taken: 0.196937
Time taken: 0.196916
$

有人可能会问：“为什么没有 cudaDeviceSynchronize() 调用时的总时间明显少于使用它时的总时间？”这本质上是出于相同的原因。异步启动队列中充满了尚未完成的工作，但是程序在启动队列中的所有工作完成之前就终止了（因为没有最终的 cudaDeviceSynchronize()）。这就导致两种情况下的总执行时间出现明显差异。只需在程序末尾添加一个 cudaDeviceSynchronize() 调用，就可以观察到这种效果。

为什么以下程序的相同cufft代码花费不同的时间？

1 个答案: