来自CUDA程序的意外输出

时间:2014-01-04 15:22:34

标签: reference cuda output

我遇到了CUDA中的代码问题。代码编译正确但在程序完全运行时会产生意外的输出。

在这种情况下,步骤应该增加,直到达到stepcount。但是,只输出一步。究竟我做错了什么?

另外,如何引用特定的xcord,ycord或zcord。例如。在CPU代码中使用数组我可以通过xcord [1]来引用一个特定的元素。在CUDA的情况下,我使用xcord [threadidx.x]吗?

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/* Kernal code */
    __global__  
void run(float *lvelox, float *lveloy, float *lveloz, float *xcord, float *ycord, float *zcord)
{
    lveloy[threadIdx.x] = lveloy[threadIdx.x] - 9.81;
    xcord[threadIdx.x] = xcord[threadIdx.x] + lvelox[threadIdx.x];
    ycord[threadIdx.x] = ycord[threadIdx.x] + lveloy[threadIdx.x];
    zcord[threadIdx.x] = zcord[threadIdx.x] + lveloz[threadIdx.x];

}

/* Host code */
int main(void) {
    FILE *ofp;
    char *mode = "r";
    char outputFilename[] = "Output.txt";
    float dlvelox;
    float dlveloy;
    float dlveloz;
    float lvelox[1000] = {};
    float lveloy[1000] = {};
    float lveloz[1000] = {};
    float xcord[1000] = {};
    float ycord[1000] = {};
    float zcord[1000] = {};
    int fp;
    int id;
    int stepcount;
    int step = 0;

    {
        ofp = fopen(outputFilename, "w");

        if (ofp == NULL) {
            fprintf(stderr, "Can't open output file %s!\n", outputFilename);
            exit(1);
        }


        /* Obtaining velocity */
        {
            printf("Enter the initial horizontal velocity of the balls:\n");
            scanf("%f", &dlvelox);
            fprintf(ofp, "Initial horizontal velocity: %f\n", dlvelox);
            printf("Enter the initial vertical velocity of the balls:\n");
            scanf("%f", &dlveloy);
            fprintf(ofp, "Initial vertical velocity: %f\n", dlveloy);
            printf("Enter the initial Z velocity of the balls:\n");
            scanf("%f", &dlveloz);
            fprintf(ofp, "Initial Z velocity: %f\n", dlveloz);
        }

        for (int i = 0; i < 1000; i++)
            lvelox[i] = dlvelox;
        for (int i = 0; i < 1000; i++)
            lveloy[i] = dlveloy;
        for (int i = 0; i < 1000; i++)
            lveloz[i] = dlveloz;

        /* Obtain number of steps */
        {
            printf("Enter the number of steps wanted:\n");
            scanf("%d", &stepcount);
            fprintf(ofp, "Number of steps: %d\n", stepcount);
        }

        /* Initial console display */
        {
            fprintf(ofp, "\n");
            fprintf(ofp, "X-cord, Y-cord, Z-cord, Horizontal Velo, Vertical Velo, Z Velo, Ball ID, Step\n");
            fprintf(ofp, "\n");

        }

        /* GPU setup */

        float *lveloxd;
        float *lveloyd;
        float *lvelozd;
        float *xcordd;
        float *ycordd;
        float *zcordd;
        int *stepd;
        const int fsize = 1000*sizeof(float);
        const int isize = 1000*sizeof(int);

        /* Loop method */
        while ( step < stepcount )
        {    
            /* Memory allocation and copying to GPU */
            cudaMalloc( (void**)&lveloxd, fsize ); 
            cudaMalloc( (void**)&lveloyd, fsize );
            cudaMalloc( (void**)&lvelozd, fsize );
            cudaMalloc( (void**)&xcordd, fsize );
            cudaMalloc( (void**)&ycordd, fsize );
            cudaMalloc( (void**)&zcordd, fsize );

            cudaMemcpy( lveloxd, lvelox, fsize, cudaMemcpyHostToDevice ); 
            cudaMemcpy( lveloyd, lveloy, fsize, cudaMemcpyHostToDevice ); 
            cudaMemcpy( lvelozd, lveloz, fsize, cudaMemcpyHostToDevice ); 
            cudaMemcpy( xcordd, xcord, fsize, cudaMemcpyHostToDevice );
            cudaMemcpy( ycordd, ycord, fsize, cudaMemcpyHostToDevice ); 
            cudaMemcpy( zcordd, zcord, fsize, cudaMemcpyHostToDevice );

            /* Perform ACTUAL LOOP */
            dim3 dimBlock( 1000  );  
            dim3 dimGrid ( 1  );
            run<<<dimGrid, dimBlock>>>(lveloxd, lveloyd, lvelozd, xcordd, ycordd, zcordd);

            /* Copy back the data */
            cudaMemcpy( lvelox, lveloxd, fsize, cudaMemcpyDeviceToHost ); 
            cudaMemcpy( lveloy, lveloyd, fsize, cudaMemcpyDeviceToHost ); 
            cudaMemcpy( lveloz, lvelozd, fsize, cudaMemcpyDeviceToHost ); 
            cudaMemcpy( xcord, xcordd, fsize, cudaMemcpyDeviceToHost ); 
            cudaMemcpy( ycord, ycordd, fsize, cudaMemcpyDeviceToHost ); 
            cudaMemcpy( zcord, zcordd, fsize, cudaMemcpyDeviceToHost );   
            cudaFree( lveloxd );
            cudaFree( lveloyd );
            cudaFree( lvelozd );
            cudaFree( xcordd );
            cudaFree( ycordd );
            cudaFree( zcordd );

            fprintf(ofp, "%f, %f, %f, %f, %f, %f, %d\n", xcord, ycord, zcord, lvelox, lveloy, lveloz, step);       

            step = step + 1;

            if ( step == stepcount )
            {
                return 0;
            }

        }

        fclose(ofp);
    }
}

1 个答案:

答案 0 :(得分:0)

如果是由于内核中的SEGFAULT,则可能的答案如下:

您正在分配1000个实体的数组,并启动一个包含1000个线程的块。由于cuda的粒度为32个线程(warpsize)而你需要1000个线程,因此将启动1024个线程作为ceil(1000/32)= 32个warps = 1024个线程。

现在因为你只有1000个元素的数组,你没有请求但产生的24个线程将访问外部分配的内存,导致SEGFAULT,即lveloy[threadIdx.x]其中threadIdx.x = 1000-1023

为了防止这种情况,请将其包括在内:

if(threadIdx.x < 1000) {
    lveloy[threadIdx.x] = lveloy[threadIdx.x] - 9.81;
    xcord[threadIdx.x] = xcord[threadIdx.x] + lvelox[threadIdx.x];
    ycord[threadIdx.x] = ycord[threadIdx.x] + lveloy[threadIdx.x];
    zcord[threadIdx.x] = zcord[threadIdx.x] + lveloz[threadIdx.x];
}
  

另外,如何引用特定的xcord,ycord或zcord。   例如。在CPU代码中使用数组我可以引用一个特定的元素   xcord [1]。在CUDA的情况下,我使用xcord [threadidx.x]吗?

您可以像在CPU上一样访问它,通过文字xcord [1]或依赖于线程xcord [threadidx.x]或两者都作为xcord [threadidx.x + 1](不推荐由于非合并的记忆)