我有一个CUDA程序,用于计算大小50000
的FFT。目前,我将整个阵列复制到GPU并执行cuFFT。现在,我正在尝试优化程序,NVIDIA Visual Profiler告诉我通过并行计算并发隐藏memcopy。我的问题是:
例如,是否可以复制第一个5000
元素,然后开始计算,然后将下一批数据并行复制到计算等?
由于DFT基本上是时间值乘以复指数函数的总和,我认为应该可以“逐块”地计算FFT。
是否支持这个?它通常是一个很好的计算理念吗?
修改
更清楚的是,我不想在不同阵列上并行计算不同的FFT。假设我在时域中有很大的正弦信号,我想知道信号中有哪些频率。我的想法是将例如信号长度的三分之一复制到GPU,然后是下一个三分之一并用已经复制的输入值的前三分之一并行计算FFT。然后复制最后一个三分之一并更新输出值,直到处理完所有时间值。所以最后应该有一个输出数组在窦的频率处有一个峰值。
答案 0 :(得分:5)
请考虑以上评论,特别是:
Npartial
元素,则输出为Npartial
元素; 考虑到以上两点,我认为如果您按照下面的代码所示的方式正确使用零填充,您只能“模仿”您想要实现的目标。正如您将看到的那样,将N
作为数据大小,通过将数据划分为 NUM_STREAMS
数据块,代码执行 NUM_STREAMS
零填充< / strong>和流式大小为N
的cuFFT调用。在cuFFT之后,您必须组合(求和)部分结果。
#include <stdio.h>
#include <cufft.h>
#define BLOCKSIZE 32
#define NUM_STREAMS 3
/**********/
/* iDivUp */
/*********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/******************/
/* SUMMING KERNEL */
/******************/
__global__ void kernel(float2 *vec1, float2 *vec2, float2 *vec3, float2 *out, int N) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
out[tid].x = vec1[tid].x + vec2[tid].x + vec3[tid].x;
out[tid].y = vec1[tid].y + vec2[tid].y + vec3[tid].y;
}
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 600000;
const int Npartial = N / NUM_STREAMS;
// --- Host input data initialization
float2 *h_in1 = new float2[Npartial];
float2 *h_in2 = new float2[Npartial];
float2 *h_in3 = new float2[Npartial];
for (int i = 0; i < Npartial; i++) {
h_in1[i].x = 1.f;
h_in1[i].y = 0.f;
h_in2[i].x = 1.f;
h_in2[i].y = 0.f;
h_in3[i].x = 1.f;
h_in3[i].y = 0.f;
}
// --- Host output data initialization
float2 *h_out = new float2[N];
// --- Registers host memory as page-locked (required for asynch cudaMemcpyAsync)
gpuErrchk(cudaHostRegister(h_in1, Npartial*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in2, Npartial*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in3, Npartial*sizeof(float2), cudaHostRegisterPortable));
// --- Device input data allocation
float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
float2 *d_out; gpuErrchk(cudaMalloc((void**)&d_out, N*sizeof(float2)));
// --- Zero padding
gpuErrchk(cudaMemset(d_in1, 0, N*sizeof(float2)));
gpuErrchk(cudaMemset(d_in2, 0, N*sizeof(float2)));
gpuErrchk(cudaMemset(d_in3, 0, N*sizeof(float2)));
// --- Creates CUDA streams
cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
// --- Creates cuFFT plans and sets them in streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int i = 0; i < NUM_STREAMS; i++) {
cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
cufftSetStream(plans[i], streams[i]);
}
// --- Async memcopyes and computations
gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
gpuErrchk(cudaMemcpyAsync(&d_in2[Npartial], h_in2, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
gpuErrchk(cudaMemcpyAsync(&d_in3[2*Npartial], h_in3, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i]));
kernel<<<iDivUp(BLOCKSIZE,N), BLOCKSIZE>>>(d_out1, d_out2, d_out3, d_out, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_out, d_out, N*sizeof(float2), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) printf("i = %i; real(h_out) = %f; imag(h_out) = %f\n", i, h_out[i].x, h_out[i].y);
// --- Releases resources
gpuErrchk(cudaHostUnregister(h_in1));
gpuErrchk(cudaHostUnregister(h_in2));
gpuErrchk(cudaHostUnregister(h_in3));
gpuErrchk(cudaFree(d_in1));
gpuErrchk(cudaFree(d_in2));
gpuErrchk(cudaFree(d_in3));
gpuErrchk(cudaFree(d_out1));
gpuErrchk(cudaFree(d_out2));
gpuErrchk(cudaFree(d_out3));
gpuErrchk(cudaFree(d_out));
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out;
cudaDeviceReset();
return 0;
}
这是在Kepler K20c卡上运行时上述代码的时间线。如您所见,计算与异步内存传输重叠。