Question

我试图在cuda设备上并行编写一些代码，但是我遇到了问题，我的gfx驱动程序在程序运行时一直崩溃，但只有这个程序，其他cuda代码运行正常。并且它给出了一个错误的anwser，但我认为这是因为崩溃！

注意：它在Quadro 2000M GFX卡上被破坏了！

这是我的代码的并行版本。

#include <stdio.h>
#include <time.h>
#include <math.h>

#define N 16
#define threads 512
#define MaxBlocks 6500

__global__ void cudaCalculateBestPath(int *M, int *bestToDiagPathSum, int *bestFromDiagPathSum, 
        unsigned int *bestToDiagPathCode, unsigned int *bestFromDiagPathCode)
{
    int x = ((unsigned int)1 << (N-1));
    unsigned int currentPathCode = blockIdx.x * threads + threadIdx.x;
    // This while is for if we are over the max amount of blocks
    while(currentPathCode < x)
    {
        int test = ((unsigned int)1 << (N-1));
        if(currentPathCode >= test)
            return;

        unsigned int ui = currentPathCode;

        int toDiagPathSum = M[0];
        int toDiagRow = 0; 
        int toDiagCol = 0;

        int fromDiagPathSum = M[(N-1)*N+N-1];
        int fromDiagRow = N-1;
        int fromDiagCol = N-1;

        for (int i = 0; i < N-1; i++) 
        {
            if (ui % 2 == 0) 
            {
                toDiagCol++;                 // horizontal move
                fromDiagCol--;
            }
            else                            
            {
                toDiagRow++;                 // vertical move
                fromDiagRow--;
            }
            toDiagPathSum += M[toDiagRow*N+toDiagCol];
            fromDiagPathSum += M[fromDiagRow*N+fromDiagCol];
            ui = ui >> 1;
        }

        if (toDiagPathSum < bestToDiagPathSum[toDiagRow]) 
        {
            bestToDiagPathSum[toDiagRow] = toDiagPathSum;
            bestToDiagPathCode[toDiagRow] = currentPathCode; 
        }

        if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow]) 
        {
            bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum;
            bestFromDiagPathCode[fromDiagRow] = currentPathCode; 
        }

        // Next run
        currentPathCode = blockDim.x + gridDim.x;
    }
}

int main()
{
    clock_t start = clock();

    //--- create and initialize M matrix (including best path) 

    int M[N*N];
    for (int row = 0; row < N; row++)
        for (int col = 0; col < N; col++)
            M[row*N+col] = 2;

#pragma region Test Path
    M[ 0*N+0] = 1;
    M[ 0*N+1] = 1;
    M[ 0*N+2] = 1;
    M[ 1*N+2] = 1;
    M[ 1*N+3] = 1;
    M[ 2*N+3] = 1;
    M[ 3*N+3] = 1;
    M[ 3*N+4] = 1;
    M[ 3*N+5] = 1;
    M[ 3*N+6] = 1;
    M[ 4*N+6] = 1;
    M[ 5*N+6] = 1;
    M[ 6*N+6] = 1;
    M[ 7*N+6] = 1;
    M[ 8*N+6] = 1;
    M[ 9*N+6] = 1;
    M[10*N+6] = 1;
    M[10*N+7] = 1;
    M[10*N+8] = 1;
    M[10*N+9] = 1;
    M[11*N+9] = 1;
    M[11*N+0] = 1;
    M[12*N+0] = 1;
    M[13*N+10] = 1;
    M[13*N+11] = 1;
    M[13*N+12] = 1;
    M[14*N+12] = 1;
    M[15*N+12] = 1;
    M[15*N+13] = 1;
    M[15*N+14] = 1;
    M[15*N+15] = 1;
#pragma endregion Test Path

    //--- create and initialize bestToDiag and bestFromDiag arrays

    int bestToDiagPathSum[N]; 
    int bestFromDiagPathSum[N];

    unsigned int bestToDiagPathCode[N];
    unsigned int bestFromDiagPathCode[N];

    int biggerThanMaxPathSum = 256*N + 1;
    for (int i = 0; i < N; i++)
    {
        bestToDiagPathSum[i]   = biggerThanMaxPathSum; 
        bestFromDiagPathSum[i] = biggerThanMaxPathSum;
    }

    //--- iterate through path codes, updating bestToDiag and bestFromDiag arrays

    int x = ((unsigned int)1 << (N-1));

    // Trick for celin the total blocks
    int TotalBlocks = (x+threads)/threads;
    if(TotalBlocks > MaxBlocks)
        TotalBlocks = MaxBlocks;

    int *dev_M, *dev_bestToDiagPathSum, *dev_bestFromDiagPathSum;
    unsigned int *dev_bestToDiagPathCode, *dev_bestFromDiagPathCode;

    // allocate memory
    cudaMalloc((void**)&dev_M, N*N*sizeof(int));
    cudaMalloc((void**)&dev_bestToDiagPathSum, N*sizeof(int));
    cudaMalloc((void**)&dev_bestFromDiagPathSum, N*sizeof(int));
    cudaMalloc((void**)&dev_bestToDiagPathCode, N*sizeof(int));
    cudaMalloc((void**)&dev_bestFromDiagPathCode, N*sizeof(int));

    // Copy memory to device
    cudaMemcpy( dev_M, M, N*N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bestToDiagPathSum, bestToDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bestFromDiagPathSum, bestFromDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bestToDiagPathCode, bestToDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bestFromDiagPathCode, bestFromDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice);

    // Run code on device
    printf("Blocks: %d\n", TotalBlocks);
    printf("Threads: %d\n\n", threads);
    cudaCalculateBestPath<<<TotalBlocks,threads>>>(dev_M, dev_bestToDiagPathSum, dev_bestFromDiagPathSum, 
        dev_bestToDiagPathCode, dev_bestFromDiagPathCode);

    // Insert code here to run while the GPU is running.

    // Copy the mem back
    cudaMemcpy( M, dev_M, N*N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy( bestToDiagPathSum, dev_bestToDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy( bestFromDiagPathSum, dev_bestFromDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy( bestToDiagPathCode, dev_bestToDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy( bestFromDiagPathCode, dev_bestFromDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost);

    int bestPathSum = biggerThanMaxPathSum;
    unsigned int bestPathCodePrefix;
    unsigned int bestPathCodeSuffix;

    int tempSum;

    for (int i = 0; i < N; i++)
    {
        tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)];
        if (tempSum < bestPathSum)
        {
            bestPathSum = tempSum;
            bestPathCodePrefix = bestToDiagPathCode[i];
            bestPathCodeSuffix = bestFromDiagPathCode[i];
        }
    }

    //--- output best path sum and best path diagram

    printf("Best Path Sum = %d\n\n",bestPathSum);

    M[0] = -M[0];
    int toDiagRow = 0;
    int toDiagCol = 0;
    unsigned int ui = bestPathCodePrefix;
    for (int i = 0; i < N-1; i++) 
    {
        if (ui % 2 == 0) 
            toDiagCol++;                 // horizontal move
        else                            
            toDiagRow++;                 // vertical move
        M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol];
        ui = ui >> 1;
    }

    M[(N-1)*N+N-1] = -M[(N-1)*N+N-1];
    int fromDiagRow = N-1;
    int fromDiagCol = N-1;
    ui = bestPathCodeSuffix;
    for (int i = 0; i < N-2; i++) 
    {
        if (ui % 2 == 0) 
            fromDiagCol--;                 // horizontal move
        else                            
            fromDiagRow--;                 // vertical move
        M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol];
        ui = ui >> 1;
    }

    for (int row = N-1; row >= 0; row--)
    {
        for (int col = 0; col <= N-1; col++)
            if (M[row*N+col] < 0)
            {
                printf("*");
                M[row*N+col] = -M[row*N+col];
            }
            else
                printf("%d",M[row*N+col]); 
        printf("\n");
    }

    printf("\nTime elapsed: %f", ((double)clock() - start) / CLOCKS_PER_SEC);

    int dummyReadForPause;
    scanf_s("%d",&dummyReadForPause);

    return 0;
}

顺序代码（完美工作）

#include <stdio.h>
#include <time.h>
#include <math.h>

#define N 16
#define threads 512
#define MaxBlocks 6500

int main()
{
    clock_t start = clock();

    //--- create and initialize M matrix (including best path) 

    int M[N*N];
    for (int row = 0; row < N; row++)
        for (int col = 0; col < N; col++)
            M[row*N+col] = 2;

#pragma region Test Path
    M[ 0*N+0] = 1;
    M[ 0*N+1] = 1;
    M[ 0*N+2] = 1;
    M[ 1*N+2] = 1;
    M[ 1*N+3] = 1;
    M[ 2*N+3] = 1;
    M[ 3*N+3] = 1;
    M[ 3*N+4] = 1;
    M[ 3*N+5] = 1;
    M[ 3*N+6] = 1;
    M[ 4*N+6] = 1;
    M[ 5*N+6] = 1;
    M[ 6*N+6] = 1;
    M[ 7*N+6] = 1;
    M[ 8*N+6] = 1;
    M[ 9*N+6] = 1;
    M[10*N+6] = 1;
    M[10*N+7] = 1;
    M[10*N+8] = 1;
    M[10*N+9] = 1;
    M[11*N+9] = 1;
    M[11*N+0] = 1;
    M[12*N+0] = 1;
    M[13*N+10] = 1;
    M[13*N+11] = 1;
    M[13*N+12] = 1;
    M[14*N+12] = 1;
    M[15*N+12] = 1;
    M[15*N+13] = 1;
    M[15*N+14] = 1;
    M[15*N+15] = 1;
#pragma endregion Test Path

    //--- create and initialize bestToDiag and bestFromDiag arrays

    int bestToDiagPathSum[N]; 
    int bestFromDiagPathSum[N];

    unsigned int bestToDiagPathCode[N];
    unsigned int bestFromDiagPathCode[N];

    int biggerThanMaxPathSum = 256*N + 1;
    for (int i = 0; i < N; i++)
    {
        bestToDiagPathSum[i]   = biggerThanMaxPathSum; 
        bestFromDiagPathSum[i] = biggerThanMaxPathSum;
    }

    //--- iterate through path codes, updating bestToDiag and bestFromDiag arrays
    int toDiagPathSum, toDiagRow, toDiagCol;
    int fromDiagPathSum, fromDiagRow, fromDiagCol;
    unsigned int ui;
    int x = ((unsigned int)1 << (N-1));

    for(unsigned int currentPathCode = 0; currentPathCode < x; currentPathCode++)
    {
        ui = currentPathCode;

        toDiagPathSum = M[0];
        toDiagRow = 0; 
        toDiagCol = 0;

        fromDiagPathSum = M[(N-1)*N+N-1];
        fromDiagRow = N-1;
        fromDiagCol = N-1;

        for (int i = 0; i < N-1; i++) 
        {
            if (ui % 2 == 0) 
            {
                toDiagCol++;                 // horizontal move
                fromDiagCol--;
            }
            else                            
            {
                toDiagRow++;                 // vertical move
                fromDiagRow--;
            }
            toDiagPathSum += M[toDiagRow*N+toDiagCol];
            fromDiagPathSum += M[fromDiagRow*N+fromDiagCol];
            ui = ui >> 1;
        }

        if (toDiagPathSum < bestToDiagPathSum[toDiagRow]) 
        {
            bestToDiagPathSum[toDiagRow] = toDiagPathSum;
            bestToDiagPathCode[toDiagRow] = currentPathCode; 
        }

        if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow]) 
        {
            bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum;
            bestFromDiagPathCode[fromDiagRow] = currentPathCode; 
        }
    }

    int bestPathSum = biggerThanMaxPathSum;
    unsigned int bestPathCodePrefix;
    unsigned int bestPathCodeSuffix;

    int tempSum;

    for (int i = 0; i < N; i++)
    {
        tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)];
        if (tempSum < bestPathSum)
        {
            bestPathSum = tempSum;
            bestPathCodePrefix = bestToDiagPathCode[i];
            bestPathCodeSuffix = bestFromDiagPathCode[i];
        }
    }

    //--- output best path sum and best path diagram

    printf("Best Path Sum = %d\n\n",bestPathSum);

    M[0] = -M[0];
    toDiagRow = 0;
    toDiagCol = 0;
    ui = bestPathCodePrefix;
    for (int i = 0; i < N-1; i++) 
    {
        if (ui % 2 == 0) 
            toDiagCol++;                 // horizontal move
        else                            
            toDiagRow++;                 // vertical move
        M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol];
        ui = ui >> 1;
    }

    M[(N-1)*N+N-1] = -M[(N-1)*N+N-1];
    fromDiagRow = N-1;
    fromDiagCol = N-1;
    ui = bestPathCodeSuffix;
    for (int i = 0; i < N-2; i++) 
    {
        if (ui % 2 == 0) 
            fromDiagCol--;                 // horizontal move
        else                            
            fromDiagRow--;                 // vertical move
        M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol];
        ui = ui >> 1;
    }

    for (int row = N-1; row >= 0; row--)
    {
        for (int col = 0; col <= N-1; col++)
            if (M[row*N+col] < 0)
            {
                printf("*");
                M[row*N+col] = -M[row*N+col];
            }
            else
                printf("%d",M[row*N+col]); 
        printf("\n");
    }

    printf("\nTime elapsed: %f", ((double)clock() - start) / CLOCKS_PER_SEC);

    int dummyReadForPause;
    scanf_s("%d",&dummyReadForPause);

    return 0;
}

Answer 1

如果你有一个长时间运行的内核，Windows将停止你的驱动程序，以防止系统无响应。您需要执行一些注册以删除此功能。看看这里：http://msdn.microsoft.com/en-us/windows/hardware/gg487368.aspx

CUDA程序崩溃的驱动程序

1 个答案: