Question

（对不起我的英语）我研究CUDA。我在C CUDA中有关于热2D的代码，但是当我编译代码时会出现错误（不是语法错误）。此错误是＆＃34;违规段＆＃34;。我认为这是一个记忆容量，但我不确定，我不知道该怎么做。请帮帮我。

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <string.h>
    #include <sys/time.h>
    #include <getopt.h>

    #include <cuda.h>

    #define MAXSTEP     1000
    #define CX          0.001
    #define CY          0.001

    #define NTHREADS    32

    void setupBoundaryConditions (double** X, unsigned long int sizex, unsigned long int sizey);
    void initializeArray         (double** X, unsigned long int sizex, unsigned long int sizey);

    double** make2DDoubleArray (unsigned long int sizex, unsigned long int sizey);
    void     free2DDoubleArray (double **X, unsigned long int size);
    void     save (double** X, unsigned long int x, unsigned long int y, char* filename);
    double   gettime(void);


    /*
     *  subroutine update - CUDA implementation
     */

     __global__ void update(unsigned long int NXPROB, unsigned long int NYPROB, double *X1, double *X2) 
     {
        long int i, j;
        long int CENTRE, NORD, SUD, EST, OEST;

        i = blockIdx.x * blockDim.x + threadIdx.x;
        j = blockIdx.y * blockDim.y + threadIdx.y;

        CENTRE = i + j*NXPROB;      //(i,j) - CENTER
        NORD = i + (j+1)*NXPROB;    //(i,j+1) - N
        SUD = i + (j-1)*NXPROB;     //(i,j-1) - S
        EST = (i+1) + j*NXPROB;     //(i+1,j) - E
        OEST = (i-1) + j*NXPROB;    //(i-1,j) - W

        //ALTERNATIVE0: THIS PART HAS AN ERROR 
        //ERROR: expression must have pointer-to-object type 
        //  --> it's refering to X2 and X1
        /*if(i>0 && i<NXPROB && j>0 && j<NYPROB)
        {
            X2[i][j] = X1[i][j] 
                    + CX * ( X1[i+1][j] + X1[i-1][j] - 2.0 * X1[i][j] )
                    + CY * ( X1[i][j+1] + X1[i][j-1] - 2.0 * X1[i][j] );
        }*/

        //ALTERNATIVE 1:
        if(i>0 && i<NXPROB && j>0 && j<NYPROB)
        {
            X2[CENTRE] = X1[CENTRE]
                    + CX * (X1[EST] + X1[OEST] - 2.0 * X1[CENTRE])
                    + CY * (X1[NORD] + X1[SUD] - 2.0 * X1[CENTRE]);
        }
        __syncthreads();    
     }

     /*
      * Main
      */ 

    int main(int argc, char* argv[]) {
        long int s, it;
        unsigned int flag, verbose;
        unsigned int NX, NY, NXPROB, NYPROB;
        double start, end;
        int iz;

        // Defaut values
        NX = 100;
        NY = 100;

        // create file and verbose flags 
        flag = 0;
        verbose = 0;

        // Parse command line options 
        int opt;
        char *file = NULL;
        while ((opt = getopt(argc, argv, "hvs:f:")) != -1) {
            switch (opt) {
            case 'v': 
                verbose = 1; 
                break;
            case 's': 
                if( !(s=atoi(optarg)) )  {
                    fprintf(stderr, "Cannot parse %s value.\n", optarg);
                    exit(EXIT_FAILURE);
                }
                NX = NY = s; 
                break;
            case 'f': 
                file = optarg;
                flag = 1;
                break;
            case 'h':
            default:
                fprintf(stderr, "Usage: %s [-s SIZE] [-f output file]\n", argv[0]);
                exit(EXIT_FAILURE);
            }
        }


        // Set initial data values  
        NXPROB = NX - 1;
        NYPROB = NY - 1;

        if(verbose) {
            fprintf(stdout, "[INFO] Setting map size to %d (%dx%d)\n", NX*NY, NX, NY);  
            fprintf(stdout, "[INFO] Max iter %d\n", MAXSTEP);
        }
        if(verbose && flag) {
            fprintf(stdout, "[INFO] Using output file %s\n", file);
        }

        // Program starts here 
        start = gettime();

        // CPU Memory allocation        
        double** X[2];
        X[0] = make2DDoubleArray (NX, NY);  
        X[1] = make2DDoubleArray (NX, NY);  

        // Set initial and boundary conditions 
        initializeArray (X[0], NX, NY);
        setupBoundaryConditions(X[0], NX, NY);
        setupBoundaryConditions(X[1], NX, NY);

        // GPU Memory allocation
        double *d_X1, *d_X2;  
        cudaMalloc((void **)&d_X1, NX*NY*sizeof(double));   
        cudaMalloc((void **)&d_X2, NX*NY*sizeof(double));

        // Copy CPU --> GPU
        cudaMemcpy(d_X1, X, NX*NY*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(d_X2, X, NX*NY*sizeof(double), cudaMemcpyHostToDevice);

        dim3 dimBlock(NTHREADS,NTHREADS);
        dim3 dimGrid(1,1); 

        // Main calculations
        iz = 0;
        for (it = 0; it < MAXSTEP; it++) 
        {
            if(verbose && (it%(MAXSTEP/10) == 0)) {
                fprintf(stdout, "[INFO] iteration %ld, time %.3f seconds\n", it, gettime()-start);
            }
            // The first update has an error: d_X1 and d_X2
            //update<<<dimGrid, dimBlock>>>(NXPROB, NYPROB, d_X1[iz], d_X2[1-iz]);
            update<<<dimGrid, dimBlock>>>(NXPROB, NYPROB, d_X1, d_X2);
            iz = 1 - iz;
        }

        // Copy GPU --> CPU
        cudaMemcpy(X, d_X1, NX*NY*sizeof(double), cudaMemcpyDeviceToHost);
        cudaMemcpy(X, d_X2, NX*NY*sizeof(double), cudaMemcpyDeviceToHost);

        //cudaThreadSynchronize();

        // Save output file
        if(flag) save(X[iz], NX, NY, file);

        free2DDoubleArray(X[0], NX);
        free2DDoubleArray(X[1], NX);

        // End time
        end = gettime();

        // Get information: wall clock time, problem size, ... 
        if(verbose)
        {
            fprintf(stdout, "[INFO] Convergence after %d steps\n", MAXSTEP);
            fprintf(stdout, "[INFO] Problem size %d [%dx%d]\n", NY*NX, NX, NY);
            fprintf(stdout, "[INFO] Wall clock time %lf seconds\n",(end-start));
            if(flag) fprintf(stdout, "[INFO] Output file  %s\n", file);
        } 
        else 
        {
            printf("Time %.3f seconds, Size %d [%dx%d]\n", end - start, NY*NX, NX, NY);
        }

        cudaFree(d_X1);
        cudaFree(d_X2);

        exit(EXIT_SUCCESS);
    }

    void setupBoundaryConditions(double** X, unsigned long int x, unsigned long int y) {
        /* set boundary conditions for ix, jy = 0 and ix, jy = n-1 */
        unsigned long int i, j;
        double leftBC = 0, 
            rightBC = 0, 
            topBC = 0, 
            bottomBC = 0;

        /* setup the bottom and top BCs, jy = 0 and jy = n-1 or arraySizeY - 1 */
        for (i = 0; i < x; i++)
        {
            X[i][0] = bottomBC; //bottom BC
            X[i][y-1] = topBC;  //top BC
        }   

        /* setup the left and right BCs, ix = 0 and ix = arraySizeX - 1 */
        for (j = 0; j < y; j++) 
        {
            X[0][j]   = leftBC;   //left BC
            X[x-1][j] = rightBC;  //right BC
        }

        /* set the values at the corner nodes as averages of both sides*/
        // bottom left
        X[0][0]     = 0.5 * (leftBC + bottomBC);
        // top left
        X[0][y-1]   = 0.5 * (topBC + leftBC);
        // top right
        X[x-1][y-1] = 0.5 * (topBC + rightBC);
        // bottom right
        X[x-1][0]   = 0.5 * (bottomBC + rightBC);
    }


    void initializeArray(double** X, unsigned long int x, unsigned long int y) {
        unsigned long int i, j;

        for (i = 1; i < x; i++) 
        {
            for (j = 1; j < y; j++)
            { 
                X[i][j] = (double)MAXSTEP+(i * (x - i - 1) * j * (y - j - 1));
            }
        }

        for (i = 0; i < x; i++) 
        {
            X[i][0] = 0;    //bottom BC
            X[i][y-1] = 0;  //top BC
        }
        for (j = 1; j < y; j++)
        { 
            X[0][j]   = 0;  //left BC
            X[x-1][j] = 0;  //right BC
        }
    }

    void free2DDoubleArray(double **X, unsigned long int size) {
        unsigned long int i;
        for (i = 0; i < size; ++i) {
            free(X[i]);
        }
        free(X);
    }

    double** make2DDoubleArray(unsigned long int x, unsigned long int y) {
            unsigned long int ix;
        double** X;
        X = (double**) malloc(x*sizeof(double*));
        for (ix = 0; ix < x; ix++) {
            X[ix] =(double*) malloc(y*sizeof(double));
        }
        return X;
    }

    void save(double** X, unsigned long int x, unsigned long int y, char* filename) {
        unsigned long int i, j;
        FILE* file;
        file = fopen(filename,"w");

        for (i = 0; i < x; i++) 
        {
            for (j = 0; j < y; j++)
            {
                fprintf(file,"%8.3f ", X[i][j]);
            }
            fprintf(file,"\n");
        }
        fclose(file);
    }

    /* Timing function */
    double gettime(void) {
        struct timeval tv;
        gettimeofday(&tv,NULL);
        return tv.tv_sec + 1e-6*tv.tv_usec;
    }

Answer 1

内存容量错误是在设备中创建两个矩阵/阵列。这部分：

// GPU Memory allocation
double *d_X1, *d_X2;  
cudaMalloc((void **)&d_X1, NX*NY*sizeof(double));   
cudaMalloc((void **)&d_X2, NX*NY*sizeof(double));

解决方案：

// GPU Memory allocation
double *d_X[2];  
cudaMalloc((void **)&d_X[0], NX*NY*sizeof(double) );
cudaMalloc((void **)&d_X[1], NX*NY*sizeof(double) );

完整代码：

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <string.h>
    #include <sys/time.h>
    #include <getopt.h>

    #include <cuda.h>

    #define MAXSTEP     1000
    #define CX          0.001
    #define CY          0.001

    #define NTHREADS    32

    void setupBoundaryConditions (double** X, unsigned long int sizex, unsigned long int sizey);
    void initializeArray         (double** X, unsigned long int sizex, unsigned long int sizey);

    double** make2DDoubleArray (unsigned long int sizex, unsigned long int sizey);
    void     save (double** X, unsigned long int x, unsigned long int y, char* filename);
    double   gettime(void);


    /*
     *  subroutine update - CUDA implementation
     */

     __global__ void update(bool shared_memory, unsigned int NX, unsigned long int NXPROB, unsigned long int NYPROB, double *d_X1, double *d_X2) 
     {
        //WITHOUT SHARED MEMORY - CODE THAT USES
        if(!shared_memory)  
        {
            long int i;
            long int CENTRE, NORD, SUD, EST, OEST;

            i = blockIdx.x * blockDim.x + threadIdx.x;

            CENTRE = i;             //(i,j)
            NORD = i + (NXPROB+1);  //(i,j+1)
            SUD = i + (NXPROB-1);   //(i,j-1)
            EST = i + 1;            //(i+1,j)
            OEST = i - 1;           //(i-1,j)

            if(i>NYPROB && i<NXPROB*NYPROB)
            {
                d_X2[CENTRE] = d_X1[CENTRE]
                        + CX * (d_X1[EST] + d_X1[OEST] - 2.0 * d_X1[CENTRE])
                        + CY * (d_X1[NORD] + d_X1[SUD] - 2.0 * d_X1[CENTRE]);
            }
        }
        /*  
        *   If you want execute this part, 
        *   then you put true to "shared_memory" in main 
        */
        //WITH SHARED MEMORY - SUBOPTIMAL (NOT MORE THAN SECUENCE CODE)
        else    
        {
            long int i, j, tx, ty, i2d;
            __shared__ double temp[NTHREADS][NTHREADS];
            double part_x, part_y;

            tx = threadIdx.x;
            ty = threadIdx.y;
            i = blockIdx.x * blockDim.x + tx;
            j = blockIdx.y * blockDim.y + ty;

            i2d = i + NXPROB*j;

            if(i2d < NXPROB*NYPROB)
                temp[tx][ty] = d_X1[i2d];

            __syncthreads();


            if(i>NXPROB && i<NXPROB*NYPROB && j>NYPROB && j<NXPROB*NYPROB)
            {
                if((tx>0) && (tx < NTHREADS-1) && tx < NTHREADS-1){
                    part_x = (temp[tx+1][ty] - 2*temp[tx][ty] + temp[tx-1][ty]);
                    part_y = (temp[tx][ty+1] - 2*temp[tx][ty] + temp[tx][ty-1]);
                }
                d_X2[i2d] = d_X1[i2d] + CX*CY*(part_x + part_y);
            }
        }
     }

     /*
      * Main
      */ 

    int main(int argc, char* argv[]) {
        long int s, it;
        unsigned int flag, verbose;
        unsigned int NX, NY, NXPROB, NYPROB;
        double start, end;
        int iz;

        dim3 dimBlock, dimGrid;
        bool shared_memory = false;     //FALSE = no shared memory - TRUE = shared memory

        // Defaut values
        NX = 100;
        NY = 100;

        // create file and verbose flags 
        flag = 0;
        verbose = 0;

        // Parse command line options 
        int opt;
        char *file = NULL;
        while ((opt = getopt(argc, argv, "hvs:f:")) != -1) {
            switch (opt) {
            case 'v': 
                verbose = 1; 
                break;
            case 's': 
                if( !(s=atoi(optarg)) )  {
                    fprintf(stderr, "Cannot parse %s value.\n", optarg);
                    exit(EXIT_FAILURE);
                }
                NX = NY = s; 
                break;
            case 'f': 
                file = optarg;
                flag = 1;
                break;
            case 'h':
            default:
                fprintf(stderr, "Usage: %s [-s SIZE] [-f output file]\n", argv[0]);
                exit(EXIT_FAILURE);
            }
        }


        // Set initial data values  
        NXPROB = NX - 1;
        NYPROB = NY - 1;

        if(verbose) {
            fprintf(stdout, "[INFO] Setting map size to %d (%dx%d)\n", NX*NY, NX, NY);  
            fprintf(stdout, "[INFO] Max iter %d\n", MAXSTEP);
        }
        if(verbose && flag) {
            fprintf(stdout, "[INFO] Using output file %s\n", file);
        }

        // Program starts here 
        start = gettime();

        // CPU Memory allocation        
        double** X[2];
        X[0] = make2DDoubleArray (NX, NY);  
        X[1] = make2DDoubleArray (NX, NY);  

        // Set initial and boundary conditions 
        initializeArray (X[0], NX, NY);
        setupBoundaryConditions(X[0], NX, NY);
        setupBoundaryConditions(X[1], NX, NY);

        // GPU Memory allocation
        double *d_X[2];  
        cudaMalloc((void **)&d_X[0], NX*NY*sizeof(double) );
        cudaMalloc((void **)&d_X[1], NX*NY*sizeof(double) );

        // Copy CPU --> GPU
        cudaMemcpy(d_X[0], X[0], NX*NY*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(d_X[1], X[1], NX*NY*sizeof(double), cudaMemcpyHostToDevice);

        dimBlock = dim3(NTHREADS,NTHREADS);
        dimGrid = dim3(ceil(NX/dimBlock.x),ceil(NY/dimBlock.y)); 

        // Main calculations
        iz = 0;
        for (it = 0; it < MAXSTEP; it++) 
        {
            if(verbose && (it%(MAXSTEP/10) == 0)) {
                fprintf(stdout, "[INFO] iteration %ld, time %.3f seconds\n", it, gettime()-start);
            }
            update<<<dimGrid, dimBlock>>>(shared_memory, NX, NXPROB, NYPROB, d_X[iz], d_X[1-iz]);
            iz = 1 - iz;
        }

        cudaThreadSynchronize();

        // Copy GPU --> CPU
        //cudaMemcpy(X[iz], d_X[iz], NX*NY*sizeof(double), cudaMemcpyDeviceToHost);
        cudaMemcpy(X, d_X, NX*NY*sizeof(double), cudaMemcpyDeviceToHost);

        // Save output file
        if(flag) save(X[iz], NX, NY, file);

        // End time
        end = gettime();

        // Get information: wall clock time, problem size, ... 
        if(verbose)
        {
            fprintf(stdout, "[INFO] Convergence after %d steps\n", MAXSTEP);
            fprintf(stdout, "[INFO] Problem size %d [%dx%d]\n", NY*NX, NX, NY);
            fprintf(stdout, "[INFO] Wall clock time %lf seconds\n",(end-start));
            if(flag) fprintf(stdout, "[INFO] Output file  %s\n", file);
        } 
        else 
        {
            printf("Time %.3f seconds, Size %d [%dx%d]\n", end - start, NY*NX, NX, NY);
        }

        cudaFree(d_X);

        exit(EXIT_SUCCESS);
    }

    void setupBoundaryConditions(double** X, unsigned long int x, unsigned long int y) {
        /* set boundary conditions for ix, jy = 0 and ix, jy = n-1 */
        unsigned long int i, j;
        double leftBC = 0, 
            rightBC = 0, 
            topBC = 0, 
            bottomBC = 0;

        /* setup the bottom and top BCs, jy = 0 and jy = n-1 or arraySizeY - 1 */
        for (i = 0; i < x; i++)
        {
            X[i][0] = bottomBC; //bottom BC
            X[i][y-1] = topBC;  //top BC
        }   

        /* setup the left and right BCs, ix = 0 and ix = arraySizeX - 1 */
        for (j = 0; j < y; j++) 
        {
            X[0][j]   = leftBC;   //left BC
            X[x-1][j] = rightBC;  //right BC
        }

        /* set the values at the corner nodes as averages of both sides*/
        // bottom left
        X[0][0]     = 0.5 * (leftBC + bottomBC);
        // top left
        X[0][y-1]   = 0.5 * (topBC + leftBC);
        // top right
        X[x-1][y-1] = 0.5 * (topBC + rightBC);
        // bottom right
        X[x-1][0]   = 0.5 * (bottomBC + rightBC);
    }


    void initializeArray(double** X, unsigned long int x, unsigned long int y) {
        unsigned long int i, j;

        for (i = 1; i < x; i++) 
        {
            for (j = 1; j < y; j++)
            { 
                X[i][j] = (double)MAXSTEP+(i * (x - i - 1) * j * (y - j - 1));
            }
        }

        for (i = 0; i < x; i++) 
        {
            X[i][0] = 0;    //bottom BC
            X[i][y-1] = 0;  //top BC
        }
        for (j = 1; j < y; j++)
        { 
            X[0][j]   = 0;  //left BC
            X[x-1][j] = 0;  //right BC
        }
    }

    double** make2DDoubleArray(unsigned long int x, unsigned long int y) {
            unsigned long int ix;
        double** X;
        X = (double**) malloc(x*sizeof(double*));
        for (ix = 0; ix < x; ix++) {
            X[ix] =(double*) malloc(y*sizeof(double));
        }
        return X;
    }

    void save(double** X, unsigned long int x, unsigned long int y, char* filename) {
        unsigned long int i, j;
        FILE* file;
        file = fopen(filename,"w");

        for (i = 0; i < x; i++) 
        {
            for (j = 0; j < y; j++)
            {
                fprintf(file,"%8.3f ", X[i][j]);
            }
            fprintf(file,"\n");
        }
        fclose(file);
    }

    /* Timing function */
    double gettime(void) {
        struct timeval tv;
        gettimeofday(&tv,NULL);
        return tv.tv_sec + 1e-6*tv.tv_usec;
    }

虽然结果可能会有一些错误。

解决CUDA中的热量二维问题

1 个答案: