Question

我在CUDA（FFT）中做了一些事情，但我不知道为什么在调用内核函数时会产生异常。

所有包含和定义：

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>


#define CPU_ARRAY_SIZE 1024   // 1024, 2048, 4096 8192
#define GPU_ARRAY_SIZE 512   // 
#define THREAD_SIZE 16        // fixed
#define BLOCK_SIZE (GPU_ARRAY_SIZE/THREAD_SIZE)  // 32

#define PI 3.14

当我在NVIDIA GTX480中运行时，我认为它可能是共享内存空间，虽然它似乎不是（因为有“很多”共享变量）。所以，我要更改GPU_ARRAY_SIZE以查看它是如何工作的，当我将它定义为32,64,256,512时，它给了我不同的结果（在512的情况下，它返回全零，我猜CUDA不能做任何事情 - 在其他情况下，它会返回奇怪的，因为我不知道为什么它在没有任何计算的情况下跳过16个单元格的原因。在大多数情况下，在我的Microsoft Visual Studio的“输出”窗口中，它返回数十亿个样式的异常“在.exe中的第一次机会异常0x75b9b9bc：Microsoft C ++异常：内存位置的cudaError_enum”。在你要我调试之前，我无法调试它，因为VS不会对VS无法识别的文件（如.cpp - 至少这个理论在我的情况下工作）那样做。你们对这些问题有任何想法吗？ 1.为什么会产生异常？ 2.为什么计算，它应该为每个块中的每个单元做什么，只在几个单元内

我怎么能解决这个问题......任何想法？

核心功能：

__global__ void twiddle_factor(double *d_isub_matrix, double *d_osub_matrix)
{

    __shared__ double block[THREAD_SIZE][THREAD_SIZE];
    __shared__ double spectrum[THREAD_SIZE][THREAD_SIZE];
    __shared__ double sum_cos[THREAD_SIZE][THREAD_SIZE];  // declaring the shared sum_cos.. similarly for sum_sin
    __shared__ double sum_sin[THREAD_SIZE][THREAD_SIZE];
    __shared__ double local_cos[THREAD_SIZE][THREAD_SIZE];  // declaring the shared sum_cos.. similarly for sum_sin
    __shared__ double local_sin[THREAD_SIZE][THREAD_SIZE];

    unsigned int xIndex = threadIdx.x + blockIdx.x* blockDim.x;
    unsigned int yIndex = threadIdx.y + blockIdx.y* blockDim.y;


    int u;
    int x=0,y=0;

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    double sum_sines=0.0,sum_cosines=0.0;

    double angle=(2*PI)/GPU_ARRAY_SIZE;       

    block[tx][ty] = d_isub_matrix[yIndex*GPU_ARRAY_SIZE+xIndex];

    __syncthreads();


    //for every column!

    for(u=0; u<THREAD_SIZE; u++)
    {

        /* All threads calculate its own sin and cos value. */
        local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
        local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);


        /* Only one row is activate. The thread in row adds all element of its column. */
        if (ty == u) 
        {
            sum_sines   = 0.0;
            sum_cosines = 0.0;

            /* Access each column to add all elements of the column.*/
            for (y=0; y<THREAD_SIZE; y++)
            {
                sum_sines   += local_sin[tx][y];
                sum_cosines += local_cos[tx][y];
            }

            //if (sum_sines < 0) 
                //sum_sin[u][tx] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
            //else 
                sum_sin[u][tx] = sum_sines/GPU_ARRAY_SIZE;

            //if (sum_cosines < 0) 
                //sum_cos[u][tx] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
            //else 
                sum_cos[u][tx] = sum_cosines/GPU_ARRAY_SIZE;

        }

        __syncthreads();
    }

    spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2) 
                               +(double)pow(sum_cos[tx][ty],2));
    __syncthreads();


    block[tx][ty] = spectrum[tx][ty];


    __syncthreads();


    //for every row!

    for(u=0; u<THREAD_SIZE; u++)
    {

        /* All threads calculate its own sin and cos value. */
        local_sin[tx][ty] = block[tx][ty] * sin((angle*ty)*u);
        local_cos[tx][ty] = block[tx][ty] * cos((angle*ty)*u);


        /* Only one column is activate. The thread in colum adds all element of its row. */
        if (tx == u) 
        {
            sum_sines   = 0.0;
            sum_cosines = 0.0;

            for (x=0; x<THREAD_SIZE; x++)
            {
                sum_sines   += local_sin[x][ty];
                sum_cosines += local_cos[x][ty];
            }

            //if (sum_sines < 0) 
                //sum_sin[ty][u] = ((-1)*sum_sines)/GPU_ARRAY_SIZE;
            //else 
                sum_sin[ty][u] = sum_sines/GPU_ARRAY_SIZE;

            //if (sum_cosines < 0) 
                //sum_cos[ty][u] = ((-1)*sum_cosines)/GPU_ARRAY_SIZE;
            //else 
                sum_cos[ty][u] = sum_cosines/GPU_ARRAY_SIZE;

        }

        __syncthreads();
    }

    spectrum[tx][ty] = sqrt((double)pow(sum_sin[tx][ty],2)+(double)pow(sum_cos[tx][ty],2));
    __syncthreads();


        /* Transpose! I think this is not necessary part. */

    d_osub_matrix[xIndex*GPU_ARRAY_SIZE + yIndex] =  spectrum[threadIdx.y][threadIdx.x];

    __syncthreads();
}

主要功能：

int main(int argc, char** argv)
{

    int i,j, w, h, sw, sh;

    int numSubblock = CPU_ARRAY_SIZE / GPU_ARRAY_SIZE;
        double *d_isub_matrix,*d_osub_matrix;

    double *big_matrix  = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];
    double *big_matrix2 = new double[CPU_ARRAY_SIZE*CPU_ARRAY_SIZE];

    double *isub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
    double *osub_matrix = new double[GPU_ARRAY_SIZE*GPU_ARRAY_SIZE];
    cudaEvent_t  start,stop;
    float elapsedtime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);


    for (i=0; i<CPU_ARRAY_SIZE; i++)
    {
        for (j=0; j<CPU_ARRAY_SIZE; j++)
        big_matrix[i*CPU_ARRAY_SIZE + j] = rand();//i*CPU_ARRAY_SIZE + j;
    }   



    cudaEventRecord(start,0);


    //cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);
    //cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)*2);

    for(i = 0; i < numSubblock; i++) 
    {
        for (j=0; j < numSubblock; j++) 
        {


        // start position of subarea of big array
        cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
        cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));

        h = i*GPU_ARRAY_SIZE;

        w = j*GPU_ARRAY_SIZE;
        //printf("h = %d, w=%d",h,w);
        //system("PAUSE");

        // move subarea of big array into isub array.

        for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
        {
            for (sw = 0; sw <GPU_ARRAY_SIZE; sw++) 
            {
            isub_matrix[sh*GPU_ARRAY_SIZE+sw] = big_matrix[(h+sh)*CPU_ARRAY_SIZE + (w+sw)];

            }
        }



            cudaMemcpy(d_isub_matrix,isub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyHostToDevice);

        //call the cuda kernel
        dim3 blocks(BLOCK_SIZE, BLOCK_SIZE);
        dim3 threads(THREAD_SIZE, THREAD_SIZE);

            twiddle_factor<<<blocks, threads>>>(d_isub_matrix,d_osub_matrix);

        cudaMemcpy(osub_matrix,d_osub_matrix,((GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);


        for (sh = 0; sh < GPU_ARRAY_SIZE; sh++)
        {
            for (sw = 0; sw <GPU_ARRAY_SIZE; sw++)
            {
                big_matrix2[(h+sh)*CPU_ARRAY_SIZE + (w+sw)] = osub_matrix[sh*GPU_ARRAY_SIZE+sw];
                printf(" sh %d  sw %d  %lf  \n", sh, sw, osub_matrix[sh*GPU_ARRAY_SIZE+sw]);

            }

        }
        printf("passei por aqui algumas vezes\n");
        cudaFree(d_osub_matrix);
        cudaFree(d_isub_matrix);

      }
    }
//  cudaFree(d_osub_matrix);
//  cudaFree(d_isub_matrix);

        //Stop the time
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsedtime,start,stop);

    //showing the processing time
    printf("The processing time took... %fms to execute everything",elapsedtime);
    system("PAUSE");

        for (sh = 0; sh < CPU_ARRAY_SIZE; sh++)
        {
            for (sw = 0; sw <CPU_ARRAY_SIZE; sw++)
            {

                printf(" sh %d  sw %d  %lf  \n", sh, sw, big_matrix2[sh*CPU_ARRAY_SIZE+sw]);

            }
        }


    system("PAUSE");
    // I guess the result is "[1][0] = [1], [1][512] = [513], [513][0] = [524289], [513][512] = [524801]". 

}

Answer 1

简而言之，问题可能而且应该是下线：

// start position of subarea of big array
  cudaMalloc((void**)&d_isub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));
  cudaMalloc((void**)&d_osub_matrix,(GPU_ARRAY_SIZE*GPU_ARRAY_SIZE)*sizeof(float));

您只需为GPU中的双值分配少量内存。您的子矩阵每点分配4个字节，需要8个字节。

Cuda Exceptions

1 个答案: