
时间:2016-05-25 17:04:38

标签: cuda

(对不起我的英语)我研究CUDA。我在C CUDA中有关于热2D的代码,但是当我编译代码时会出现错误(不是语法错误)。此错误是"违规段"。我认为这是一个记忆容量,但我不确定,我不知道该怎么做。请帮帮我。

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <string.h>
    #include <sys/time.h>
    #include <getopt.h>

    #include <cuda.h>

    #define MAXSTEP     1000
    #define CX          0.001
    #define CY          0.001

    #define NTHREADS    32

    void setupBoundaryConditions (double** X, unsigned long int sizex, unsigned long int sizey);
    void initializeArray         (double** X, unsigned long int sizex, unsigned long int sizey);

    double** make2DDoubleArray (unsigned long int sizex, unsigned long int sizey);
    void     free2DDoubleArray (double **X, unsigned long int size);
    void     save (double** X, unsigned long int x, unsigned long int y, char* filename);
    double   gettime(void);

     *  subroutine update - CUDA implementation

     __global__ void update(unsigned long int NXPROB, unsigned long int NYPROB, double *X1, double *X2) 
        long int i, j;
        long int CENTRE, NORD, SUD, EST, OEST;

        i = blockIdx.x * blockDim.x + threadIdx.x;
        j = blockIdx.y * blockDim.y + threadIdx.y;

        CENTRE = i + j*NXPROB;      //(i,j) - CENTER
        NORD = i + (j+1)*NXPROB;    //(i,j+1) - N
        SUD = i + (j-1)*NXPROB;     //(i,j-1) - S
        EST = (i+1) + j*NXPROB;     //(i+1,j) - E
        OEST = (i-1) + j*NXPROB;    //(i-1,j) - W

        //ERROR: expression must have pointer-to-object type 
        //  --> it's refering to X2 and X1
        /*if(i>0 && i<NXPROB && j>0 && j<NYPROB)
            X2[i][j] = X1[i][j] 
                    + CX * ( X1[i+1][j] + X1[i-1][j] - 2.0 * X1[i][j] )
                    + CY * ( X1[i][j+1] + X1[i][j-1] - 2.0 * X1[i][j] );

        //ALTERNATIVE 1:
        if(i>0 && i<NXPROB && j>0 && j<NYPROB)
            X2[CENTRE] = X1[CENTRE]
                    + CX * (X1[EST] + X1[OEST] - 2.0 * X1[CENTRE])
                    + CY * (X1[NORD] + X1[SUD] - 2.0 * X1[CENTRE]);

      * Main

    int main(int argc, char* argv[]) {
        long int s, it;
        unsigned int flag, verbose;
        unsigned int NX, NY, NXPROB, NYPROB;
        double start, end;
        int iz;

        // Defaut values
        NX = 100;
        NY = 100;

        // create file and verbose flags 
        flag = 0;
        verbose = 0;

        // Parse command line options 
        int opt;
        char *file = NULL;
        while ((opt = getopt(argc, argv, "hvs:f:")) != -1) {
            switch (opt) {
            case 'v': 
                verbose = 1; 
            case 's': 
                if( !(s=atoi(optarg)) )  {
                    fprintf(stderr, "Cannot parse %s value.\n", optarg);
                NX = NY = s; 
            case 'f': 
                file = optarg;
                flag = 1;
            case 'h':
                fprintf(stderr, "Usage: %s [-s SIZE] [-f output file]\n", argv[0]);

        // Set initial data values  
        NXPROB = NX - 1;
        NYPROB = NY - 1;

        if(verbose) {
            fprintf(stdout, "[INFO] Setting map size to %d (%dx%d)\n", NX*NY, NX, NY);  
            fprintf(stdout, "[INFO] Max iter %d\n", MAXSTEP);
        if(verbose && flag) {
            fprintf(stdout, "[INFO] Using output file %s\n", file);

        // Program starts here 
        start = gettime();

        // CPU Memory allocation        
        double** X[2];
        X[0] = make2DDoubleArray (NX, NY);  
        X[1] = make2DDoubleArray (NX, NY);  

        // Set initial and boundary conditions 
        initializeArray (X[0], NX, NY);
        setupBoundaryConditions(X[0], NX, NY);
        setupBoundaryConditions(X[1], NX, NY);

        // GPU Memory allocation
        double *d_X1, *d_X2;  
        cudaMalloc((void **)&d_X1, NX*NY*sizeof(double));   
        cudaMalloc((void **)&d_X2, NX*NY*sizeof(double));

        // Copy CPU --> GPU
        cudaMemcpy(d_X1, X, NX*NY*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(d_X2, X, NX*NY*sizeof(double), cudaMemcpyHostToDevice);

        dim3 dimBlock(NTHREADS,NTHREADS);
        dim3 dimGrid(1,1); 

        // Main calculations
        iz = 0;
        for (it = 0; it < MAXSTEP; it++) 
            if(verbose && (it%(MAXSTEP/10) == 0)) {
                fprintf(stdout, "[INFO] iteration %ld, time %.3f seconds\n", it, gettime()-start);
            // The first update has an error: d_X1 and d_X2
            //update<<<dimGrid, dimBlock>>>(NXPROB, NYPROB, d_X1[iz], d_X2[1-iz]);
            update<<<dimGrid, dimBlock>>>(NXPROB, NYPROB, d_X1, d_X2);
            iz = 1 - iz;

        // Copy GPU --> CPU
        cudaMemcpy(X, d_X1, NX*NY*sizeof(double), cudaMemcpyDeviceToHost);
        cudaMemcpy(X, d_X2, NX*NY*sizeof(double), cudaMemcpyDeviceToHost);


        // Save output file
        if(flag) save(X[iz], NX, NY, file);

        free2DDoubleArray(X[0], NX);
        free2DDoubleArray(X[1], NX);

        // End time
        end = gettime();

        // Get information: wall clock time, problem size, ... 
            fprintf(stdout, "[INFO] Convergence after %d steps\n", MAXSTEP);
            fprintf(stdout, "[INFO] Problem size %d [%dx%d]\n", NY*NX, NX, NY);
            fprintf(stdout, "[INFO] Wall clock time %lf seconds\n",(end-start));
            if(flag) fprintf(stdout, "[INFO] Output file  %s\n", file);
            printf("Time %.3f seconds, Size %d [%dx%d]\n", end - start, NY*NX, NX, NY);



    void setupBoundaryConditions(double** X, unsigned long int x, unsigned long int y) {
        /* set boundary conditions for ix, jy = 0 and ix, jy = n-1 */
        unsigned long int i, j;
        double leftBC = 0, 
            rightBC = 0, 
            topBC = 0, 
            bottomBC = 0;

        /* setup the bottom and top BCs, jy = 0 and jy = n-1 or arraySizeY - 1 */
        for (i = 0; i < x; i++)
            X[i][0] = bottomBC; //bottom BC
            X[i][y-1] = topBC;  //top BC

        /* setup the left and right BCs, ix = 0 and ix = arraySizeX - 1 */
        for (j = 0; j < y; j++) 
            X[0][j]   = leftBC;   //left BC
            X[x-1][j] = rightBC;  //right BC

        /* set the values at the corner nodes as averages of both sides*/
        // bottom left
        X[0][0]     = 0.5 * (leftBC + bottomBC);
        // top left
        X[0][y-1]   = 0.5 * (topBC + leftBC);
        // top right
        X[x-1][y-1] = 0.5 * (topBC + rightBC);
        // bottom right
        X[x-1][0]   = 0.5 * (bottomBC + rightBC);

    void initializeArray(double** X, unsigned long int x, unsigned long int y) {
        unsigned long int i, j;

        for (i = 1; i < x; i++) 
            for (j = 1; j < y; j++)
                X[i][j] = (double)MAXSTEP+(i * (x - i - 1) * j * (y - j - 1));

        for (i = 0; i < x; i++) 
            X[i][0] = 0;    //bottom BC
            X[i][y-1] = 0;  //top BC
        for (j = 1; j < y; j++)
            X[0][j]   = 0;  //left BC
            X[x-1][j] = 0;  //right BC

    void free2DDoubleArray(double **X, unsigned long int size) {
        unsigned long int i;
        for (i = 0; i < size; ++i) {

    double** make2DDoubleArray(unsigned long int x, unsigned long int y) {
            unsigned long int ix;
        double** X;
        X = (double**) malloc(x*sizeof(double*));
        for (ix = 0; ix < x; ix++) {
            X[ix] =(double*) malloc(y*sizeof(double));
        return X;

    void save(double** X, unsigned long int x, unsigned long int y, char* filename) {
        unsigned long int i, j;
        FILE* file;
        file = fopen(filename,"w");

        for (i = 0; i < x; i++) 
            for (j = 0; j < y; j++)
                fprintf(file,"%8.3f ", X[i][j]);

    /* Timing function */
    double gettime(void) {
        struct timeval tv;
        return tv.tv_sec + 1e-6*tv.tv_usec;

1 个答案:

答案 0 :(得分:-1)


// GPU Memory allocation
double *d_X1, *d_X2;  
cudaMalloc((void **)&d_X1, NX*NY*sizeof(double));   
cudaMalloc((void **)&d_X2, NX*NY*sizeof(double));


// GPU Memory allocation
double *d_X[2];  
cudaMalloc((void **)&d_X[0], NX*NY*sizeof(double) );
cudaMalloc((void **)&d_X[1], NX*NY*sizeof(double) );


    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <string.h>
    #include <sys/time.h>
    #include <getopt.h>

    #include <cuda.h>

    #define MAXSTEP     1000
    #define CX          0.001
    #define CY          0.001

    #define NTHREADS    32

    void setupBoundaryConditions (double** X, unsigned long int sizex, unsigned long int sizey);
    void initializeArray         (double** X, unsigned long int sizex, unsigned long int sizey);

    double** make2DDoubleArray (unsigned long int sizex, unsigned long int sizey);
    void     save (double** X, unsigned long int x, unsigned long int y, char* filename);
    double   gettime(void);

     *  subroutine update - CUDA implementation

     __global__ void update(bool shared_memory, unsigned int NX, unsigned long int NXPROB, unsigned long int NYPROB, double *d_X1, double *d_X2) 
            long int i;
            long int CENTRE, NORD, SUD, EST, OEST;

            i = blockIdx.x * blockDim.x + threadIdx.x;

            CENTRE = i;             //(i,j)
            NORD = i + (NXPROB+1);  //(i,j+1)
            SUD = i + (NXPROB-1);   //(i,j-1)
            EST = i + 1;            //(i+1,j)
            OEST = i - 1;           //(i-1,j)

            if(i>NYPROB && i<NXPROB*NYPROB)
                d_X2[CENTRE] = d_X1[CENTRE]
                        + CX * (d_X1[EST] + d_X1[OEST] - 2.0 * d_X1[CENTRE])
                        + CY * (d_X1[NORD] + d_X1[SUD] - 2.0 * d_X1[CENTRE]);
        *   If you want execute this part, 
        *   then you put true to "shared_memory" in main 
            long int i, j, tx, ty, i2d;
            __shared__ double temp[NTHREADS][NTHREADS];
            double part_x, part_y;

            tx = threadIdx.x;
            ty = threadIdx.y;
            i = blockIdx.x * blockDim.x + tx;
            j = blockIdx.y * blockDim.y + ty;

            i2d = i + NXPROB*j;

            if(i2d < NXPROB*NYPROB)
                temp[tx][ty] = d_X1[i2d];


            if(i>NXPROB && i<NXPROB*NYPROB && j>NYPROB && j<NXPROB*NYPROB)
                if((tx>0) && (tx < NTHREADS-1) && tx < NTHREADS-1){
                    part_x = (temp[tx+1][ty] - 2*temp[tx][ty] + temp[tx-1][ty]);
                    part_y = (temp[tx][ty+1] - 2*temp[tx][ty] + temp[tx][ty-1]);
                d_X2[i2d] = d_X1[i2d] + CX*CY*(part_x + part_y);

      * Main

    int main(int argc, char* argv[]) {
        long int s, it;
        unsigned int flag, verbose;
        unsigned int NX, NY, NXPROB, NYPROB;
        double start, end;
        int iz;

        dim3 dimBlock, dimGrid;
        bool shared_memory = false;     //FALSE = no shared memory - TRUE = shared memory

        // Defaut values
        NX = 100;
        NY = 100;

        // create file and verbose flags 
        flag = 0;
        verbose = 0;

        // Parse command line options 
        int opt;
        char *file = NULL;
        while ((opt = getopt(argc, argv, "hvs:f:")) != -1) {
            switch (opt) {
            case 'v': 
                verbose = 1; 
            case 's': 
                if( !(s=atoi(optarg)) )  {
                    fprintf(stderr, "Cannot parse %s value.\n", optarg);
                NX = NY = s; 
            case 'f': 
                file = optarg;
                flag = 1;
            case 'h':
                fprintf(stderr, "Usage: %s [-s SIZE] [-f output file]\n", argv[0]);

        // Set initial data values  
        NXPROB = NX - 1;
        NYPROB = NY - 1;

        if(verbose) {
            fprintf(stdout, "[INFO] Setting map size to %d (%dx%d)\n", NX*NY, NX, NY);  
            fprintf(stdout, "[INFO] Max iter %d\n", MAXSTEP);
        if(verbose && flag) {
            fprintf(stdout, "[INFO] Using output file %s\n", file);

        // Program starts here 
        start = gettime();

        // CPU Memory allocation        
        double** X[2];
        X[0] = make2DDoubleArray (NX, NY);  
        X[1] = make2DDoubleArray (NX, NY);  

        // Set initial and boundary conditions 
        initializeArray (X[0], NX, NY);
        setupBoundaryConditions(X[0], NX, NY);
        setupBoundaryConditions(X[1], NX, NY);

        // GPU Memory allocation
        double *d_X[2];  
        cudaMalloc((void **)&d_X[0], NX*NY*sizeof(double) );
        cudaMalloc((void **)&d_X[1], NX*NY*sizeof(double) );

        // Copy CPU --> GPU
        cudaMemcpy(d_X[0], X[0], NX*NY*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(d_X[1], X[1], NX*NY*sizeof(double), cudaMemcpyHostToDevice);

        dimBlock = dim3(NTHREADS,NTHREADS);
        dimGrid = dim3(ceil(NX/dimBlock.x),ceil(NY/dimBlock.y)); 

        // Main calculations
        iz = 0;
        for (it = 0; it < MAXSTEP; it++) 
            if(verbose && (it%(MAXSTEP/10) == 0)) {
                fprintf(stdout, "[INFO] iteration %ld, time %.3f seconds\n", it, gettime()-start);
            update<<<dimGrid, dimBlock>>>(shared_memory, NX, NXPROB, NYPROB, d_X[iz], d_X[1-iz]);
            iz = 1 - iz;


        // Copy GPU --> CPU
        //cudaMemcpy(X[iz], d_X[iz], NX*NY*sizeof(double), cudaMemcpyDeviceToHost);
        cudaMemcpy(X, d_X, NX*NY*sizeof(double), cudaMemcpyDeviceToHost);

        // Save output file
        if(flag) save(X[iz], NX, NY, file);

        // End time
        end = gettime();

        // Get information: wall clock time, problem size, ... 
            fprintf(stdout, "[INFO] Convergence after %d steps\n", MAXSTEP);
            fprintf(stdout, "[INFO] Problem size %d [%dx%d]\n", NY*NX, NX, NY);
            fprintf(stdout, "[INFO] Wall clock time %lf seconds\n",(end-start));
            if(flag) fprintf(stdout, "[INFO] Output file  %s\n", file);
            printf("Time %.3f seconds, Size %d [%dx%d]\n", end - start, NY*NX, NX, NY);



    void setupBoundaryConditions(double** X, unsigned long int x, unsigned long int y) {
        /* set boundary conditions for ix, jy = 0 and ix, jy = n-1 */
        unsigned long int i, j;
        double leftBC = 0, 
            rightBC = 0, 
            topBC = 0, 
            bottomBC = 0;

        /* setup the bottom and top BCs, jy = 0 and jy = n-1 or arraySizeY - 1 */
        for (i = 0; i < x; i++)
            X[i][0] = bottomBC; //bottom BC
            X[i][y-1] = topBC;  //top BC

        /* setup the left and right BCs, ix = 0 and ix = arraySizeX - 1 */
        for (j = 0; j < y; j++) 
            X[0][j]   = leftBC;   //left BC
            X[x-1][j] = rightBC;  //right BC

        /* set the values at the corner nodes as averages of both sides*/
        // bottom left
        X[0][0]     = 0.5 * (leftBC + bottomBC);
        // top left
        X[0][y-1]   = 0.5 * (topBC + leftBC);
        // top right
        X[x-1][y-1] = 0.5 * (topBC + rightBC);
        // bottom right
        X[x-1][0]   = 0.5 * (bottomBC + rightBC);

    void initializeArray(double** X, unsigned long int x, unsigned long int y) {
        unsigned long int i, j;

        for (i = 1; i < x; i++) 
            for (j = 1; j < y; j++)
                X[i][j] = (double)MAXSTEP+(i * (x - i - 1) * j * (y - j - 1));

        for (i = 0; i < x; i++) 
            X[i][0] = 0;    //bottom BC
            X[i][y-1] = 0;  //top BC
        for (j = 1; j < y; j++)
            X[0][j]   = 0;  //left BC
            X[x-1][j] = 0;  //right BC

    double** make2DDoubleArray(unsigned long int x, unsigned long int y) {
            unsigned long int ix;
        double** X;
        X = (double**) malloc(x*sizeof(double*));
        for (ix = 0; ix < x; ix++) {
            X[ix] =(double*) malloc(y*sizeof(double));
        return X;

    void save(double** X, unsigned long int x, unsigned long int y, char* filename) {
        unsigned long int i, j;
        FILE* file;
        file = fopen(filename,"w");

        for (i = 0; i < x; i++) 
            for (j = 0; j < y; j++)
                fprintf(file,"%8.3f ", X[i][j]);

    /* Timing function */
    double gettime(void) {
        struct timeval tv;
        return tv.tv_sec + 1e-6*tv.tv_usec;
