我正在尝试使用visual studio 2010在CUDA中开发FFT的实现,到目前为止,我已经使它在一个块内最多可以工作1024个点。问题在于,每当我使用多个块时,块1的结果都会正常,而其他块将返回错误的值(似乎不是随机的,它们在多次运行中不会改变。)这是我的内核
__device__ void FFT(int idxS,int bfsize, Complex* data1, Complex* data0, int k, int N ){
Complex alpha;
if((idxS % bfsize) < (bfsize/2)){
data1[idxS] = ComplexAdd(data0[idxS],data0[idxS+bfsize/2]);
}
else
{
float angle = -PI*2*((idxS*(1<<k)%(bfsize/2)))/N;
alpha.x = cos(angle);
alpha.y= sin(angle);
Complex v0;
v0 = ComplexAdd(data0[idxS-bfsize/2] ,ComplexScale(data0[idxS],-1));
data1[idxS] = ComplexMul(v0, alpha);
}
}
__device__ void Ordenador(int r, int idxS ,Complex* data1, Complex* data0 ){
int p = 0;
for(int k = 0;k < r;k++)
{
if(idxS & (1<<k))
p+=1<<(r - k - 1);
}
data1[idxS] = data0[p];
__syncthreads();
}
__global__ void GPU_FFT(int N, int r, Complex* data0, Complex* data1, int k) {
int idxS = threadIdx.x+ blockIdx.x * blockDim.x;
__syncthreads;
int bfsize = 1<<(r - k);
FFT(idxS, bfsize, data1, data0, k, N);
data0[idxS] = data1[idxS];
}
int prepFFT(float *Entrada, Complex* saida, int N ){
if(ceilf(log2((float)N)) == log2((float)N) ){
for (int i=0; i<N; i++){
saida[i].x = Entrada[i];
saida[i].y = 0;
}
Complex *d_saida;
int m = (int)log2((float)N);
Complex *data1 = new Complex[N];
Complex *data1_d;
if (N<1024){
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (N,1,1);
const dim3 numBlocks(1,1,1);
for(int k = 0 ;k < m ; k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaDeviceSynchronize());
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
}
else{
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (1024,1,1);
const dim3 numBlocks(N/1024 +1,1,1);
for(int k = 0;k < m;k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
cudaFree(data1_d);
cudaFree(d_saida);
delete data1;
}
return 1;
}
else
return 0;
}
我尝试过使用共享内存,但它会返回所有0并且我认为CUDA没有从全局复制到共享(NSight会告诉我该内存位置的值是????)。这段代码应该只是现在的概念证明,不必优化,只需返回正确的值。如果你们需要完整的代码,我会提供它。我一直在寻找一个解决方案超过一个月,这是我绝望的电话。
谢谢, 约翰
-------更新--------
为了调试目的,我更改了代码,在2个块中的每个块中启动了2个线程。
int prepFFT(float *Entrada, Complex* saida, int N ){
if(ceilf(log2((float)N)) == log2((float)N) ){
for (int i=0; i<N; i++){
saida[i].x = Entrada[i];
saida[i].y = 0;
}
Complex *d_saida;
int m = (int)log2((float)N);
Complex *data1 = new Complex[N];
Complex *data1_d;
if (N<1024){
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (2,1,1);
const dim3 numBlocks(2,1,1);
for(int k = 0 ;k < m ; k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaDeviceSynchronize());
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
}
else{
HANDLE_ERROR (cudaMalloc((void**)&d_saida, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(d_saida,saida, sizeof(Complex)*N, cudaMemcpyHostToDevice));
HANDLE_ERROR (cudaMalloc((void**)&data1_d, sizeof(Complex) * N));
HANDLE_ERROR (cudaMemcpy(data1_d,data1, sizeof(Complex)*N, cudaMemcpyHostToDevice));
const dim3 numThreads (1024,1,1);
const dim3 numBlocks(N/1024 +1,1,1);
for(int k = 0;k < m;k++)
{
GPU_FFT<<<numBlocks,numThreads, N*2>>>( N, m, d_saida, data1_d, k);
HANDLE_ERROR (cudaDeviceSynchronize());
}
HANDLE_ERROR (cudaMemcpy(saida,data1_d, sizeof(Complex)*N, cudaMemcpyDeviceToHost));
HANDLE_ERROR (cudaDeviceSynchronize());
cudaFree(data1_d);
cudaFree(d_saida);
delete data1;
}
return 1;
}
else
return 0;
}
---------------------编辑2 ---------------------
真正奇怪的是,当我使用memcheck(在任何模式下)时,程序会返回正确的结果。
----最终编辑---------------
我发现问题出现在这段代码中
FFT(idxS, bfsize, data1, data0, k, N);
data0[idxS] = data1[idxS];
我发现将新函数中的最后一行分开并用CPU调用它会为我生成正确的结果。 感谢您的帮助!! 最诚挚的问候!
答案 0 :(得分:2)
首先,您应该检查主要内核函数__global__ void GPU_FFT
是否存在问题
只需将其更改为:
__global__ void GPU_FFT(int N, int r, Complex* data0, Complex* data1, int k) {
int idxS = threadIdx.x+ blockIdx.x * blockDim.x;
int bfsize = 1<<(r - k);
//FFT(idxS, bfsize, data1, data0, k, N);
//data0[idxS] = data1[idxS];
if (idxS <= N) data0[idxS] = idxS;
}
现在第二个街区会发生什么?
如果可以取消注释//FFT(idxS, bfsize, data1, data0, k, N);
并将最后一行更改为:
if (idxS <= N) data0[idxS] = data1[idxS];
现在发生了什么?仍旧的错误?
P.S。并且在检索线程索引后不需要__syncthreads;
UPD。
if((idxS % bfsize) < (bfsize/2)){
__syncthreads;
...}