我试图在cuda设备上并行编写一些代码,但是我遇到了问题,我的gfx驱动程序在程序运行时一直崩溃,但只有这个程序,其他cuda代码运行正常。并且它给出了一个错误的anwser,但我认为这是因为崩溃!
注意:它在Quadro 2000M GFX卡上被破坏了!
这是我的代码的并行版本。
#include <stdio.h>
#include <time.h>
#include <math.h>
#define N 16
#define threads 512
#define MaxBlocks 6500
__global__ void cudaCalculateBestPath(int *M, int *bestToDiagPathSum, int *bestFromDiagPathSum,
unsigned int *bestToDiagPathCode, unsigned int *bestFromDiagPathCode)
{
int x = ((unsigned int)1 << (N-1));
unsigned int currentPathCode = blockIdx.x * threads + threadIdx.x;
// This while is for if we are over the max amount of blocks
while(currentPathCode < x)
{
int test = ((unsigned int)1 << (N-1));
if(currentPathCode >= test)
return;
unsigned int ui = currentPathCode;
int toDiagPathSum = M[0];
int toDiagRow = 0;
int toDiagCol = 0;
int fromDiagPathSum = M[(N-1)*N+N-1];
int fromDiagRow = N-1;
int fromDiagCol = N-1;
for (int i = 0; i < N-1; i++)
{
if (ui % 2 == 0)
{
toDiagCol++; // horizontal move
fromDiagCol--;
}
else
{
toDiagRow++; // vertical move
fromDiagRow--;
}
toDiagPathSum += M[toDiagRow*N+toDiagCol];
fromDiagPathSum += M[fromDiagRow*N+fromDiagCol];
ui = ui >> 1;
}
if (toDiagPathSum < bestToDiagPathSum[toDiagRow])
{
bestToDiagPathSum[toDiagRow] = toDiagPathSum;
bestToDiagPathCode[toDiagRow] = currentPathCode;
}
if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow])
{
bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum;
bestFromDiagPathCode[fromDiagRow] = currentPathCode;
}
// Next run
currentPathCode = blockDim.x + gridDim.x;
}
}
int main()
{
clock_t start = clock();
//--- create and initialize M matrix (including best path)
int M[N*N];
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++)
M[row*N+col] = 2;
#pragma region Test Path
M[ 0*N+0] = 1;
M[ 0*N+1] = 1;
M[ 0*N+2] = 1;
M[ 1*N+2] = 1;
M[ 1*N+3] = 1;
M[ 2*N+3] = 1;
M[ 3*N+3] = 1;
M[ 3*N+4] = 1;
M[ 3*N+5] = 1;
M[ 3*N+6] = 1;
M[ 4*N+6] = 1;
M[ 5*N+6] = 1;
M[ 6*N+6] = 1;
M[ 7*N+6] = 1;
M[ 8*N+6] = 1;
M[ 9*N+6] = 1;
M[10*N+6] = 1;
M[10*N+7] = 1;
M[10*N+8] = 1;
M[10*N+9] = 1;
M[11*N+9] = 1;
M[11*N+0] = 1;
M[12*N+0] = 1;
M[13*N+10] = 1;
M[13*N+11] = 1;
M[13*N+12] = 1;
M[14*N+12] = 1;
M[15*N+12] = 1;
M[15*N+13] = 1;
M[15*N+14] = 1;
M[15*N+15] = 1;
#pragma endregion Test Path
//--- create and initialize bestToDiag and bestFromDiag arrays
int bestToDiagPathSum[N];
int bestFromDiagPathSum[N];
unsigned int bestToDiagPathCode[N];
unsigned int bestFromDiagPathCode[N];
int biggerThanMaxPathSum = 256*N + 1;
for (int i = 0; i < N; i++)
{
bestToDiagPathSum[i] = biggerThanMaxPathSum;
bestFromDiagPathSum[i] = biggerThanMaxPathSum;
}
//--- iterate through path codes, updating bestToDiag and bestFromDiag arrays
int x = ((unsigned int)1 << (N-1));
// Trick for celin the total blocks
int TotalBlocks = (x+threads)/threads;
if(TotalBlocks > MaxBlocks)
TotalBlocks = MaxBlocks;
int *dev_M, *dev_bestToDiagPathSum, *dev_bestFromDiagPathSum;
unsigned int *dev_bestToDiagPathCode, *dev_bestFromDiagPathCode;
// allocate memory
cudaMalloc((void**)&dev_M, N*N*sizeof(int));
cudaMalloc((void**)&dev_bestToDiagPathSum, N*sizeof(int));
cudaMalloc((void**)&dev_bestFromDiagPathSum, N*sizeof(int));
cudaMalloc((void**)&dev_bestToDiagPathCode, N*sizeof(int));
cudaMalloc((void**)&dev_bestFromDiagPathCode, N*sizeof(int));
// Copy memory to device
cudaMemcpy( dev_M, M, N*N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy( dev_bestToDiagPathSum, bestToDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy( dev_bestFromDiagPathSum, bestFromDiagPathSum, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy( dev_bestToDiagPathCode, bestToDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy( dev_bestFromDiagPathCode, bestFromDiagPathCode, N*sizeof(int), cudaMemcpyHostToDevice);
// Run code on device
printf("Blocks: %d\n", TotalBlocks);
printf("Threads: %d\n\n", threads);
cudaCalculateBestPath<<<TotalBlocks,threads>>>(dev_M, dev_bestToDiagPathSum, dev_bestFromDiagPathSum,
dev_bestToDiagPathCode, dev_bestFromDiagPathCode);
// Insert code here to run while the GPU is running.
// Copy the mem back
cudaMemcpy( M, dev_M, N*N*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy( bestToDiagPathSum, dev_bestToDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy( bestFromDiagPathSum, dev_bestFromDiagPathSum, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy( bestToDiagPathCode, dev_bestToDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy( bestFromDiagPathCode, dev_bestFromDiagPathCode, N*sizeof(int), cudaMemcpyDeviceToHost);
int bestPathSum = biggerThanMaxPathSum;
unsigned int bestPathCodePrefix;
unsigned int bestPathCodeSuffix;
int tempSum;
for (int i = 0; i < N; i++)
{
tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)];
if (tempSum < bestPathSum)
{
bestPathSum = tempSum;
bestPathCodePrefix = bestToDiagPathCode[i];
bestPathCodeSuffix = bestFromDiagPathCode[i];
}
}
//--- output best path sum and best path diagram
printf("Best Path Sum = %d\n\n",bestPathSum);
M[0] = -M[0];
int toDiagRow = 0;
int toDiagCol = 0;
unsigned int ui = bestPathCodePrefix;
for (int i = 0; i < N-1; i++)
{
if (ui % 2 == 0)
toDiagCol++; // horizontal move
else
toDiagRow++; // vertical move
M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol];
ui = ui >> 1;
}
M[(N-1)*N+N-1] = -M[(N-1)*N+N-1];
int fromDiagRow = N-1;
int fromDiagCol = N-1;
ui = bestPathCodeSuffix;
for (int i = 0; i < N-2; i++)
{
if (ui % 2 == 0)
fromDiagCol--; // horizontal move
else
fromDiagRow--; // vertical move
M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol];
ui = ui >> 1;
}
for (int row = N-1; row >= 0; row--)
{
for (int col = 0; col <= N-1; col++)
if (M[row*N+col] < 0)
{
printf("*");
M[row*N+col] = -M[row*N+col];
}
else
printf("%d",M[row*N+col]);
printf("\n");
}
printf("\nTime elapsed: %f", ((double)clock() - start) / CLOCKS_PER_SEC);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
return 0;
}
顺序代码(完美工作)
#include <stdio.h>
#include <time.h>
#include <math.h>
#define N 16
#define threads 512
#define MaxBlocks 6500
int main()
{
clock_t start = clock();
//--- create and initialize M matrix (including best path)
int M[N*N];
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++)
M[row*N+col] = 2;
#pragma region Test Path
M[ 0*N+0] = 1;
M[ 0*N+1] = 1;
M[ 0*N+2] = 1;
M[ 1*N+2] = 1;
M[ 1*N+3] = 1;
M[ 2*N+3] = 1;
M[ 3*N+3] = 1;
M[ 3*N+4] = 1;
M[ 3*N+5] = 1;
M[ 3*N+6] = 1;
M[ 4*N+6] = 1;
M[ 5*N+6] = 1;
M[ 6*N+6] = 1;
M[ 7*N+6] = 1;
M[ 8*N+6] = 1;
M[ 9*N+6] = 1;
M[10*N+6] = 1;
M[10*N+7] = 1;
M[10*N+8] = 1;
M[10*N+9] = 1;
M[11*N+9] = 1;
M[11*N+0] = 1;
M[12*N+0] = 1;
M[13*N+10] = 1;
M[13*N+11] = 1;
M[13*N+12] = 1;
M[14*N+12] = 1;
M[15*N+12] = 1;
M[15*N+13] = 1;
M[15*N+14] = 1;
M[15*N+15] = 1;
#pragma endregion Test Path
//--- create and initialize bestToDiag and bestFromDiag arrays
int bestToDiagPathSum[N];
int bestFromDiagPathSum[N];
unsigned int bestToDiagPathCode[N];
unsigned int bestFromDiagPathCode[N];
int biggerThanMaxPathSum = 256*N + 1;
for (int i = 0; i < N; i++)
{
bestToDiagPathSum[i] = biggerThanMaxPathSum;
bestFromDiagPathSum[i] = biggerThanMaxPathSum;
}
//--- iterate through path codes, updating bestToDiag and bestFromDiag arrays
int toDiagPathSum, toDiagRow, toDiagCol;
int fromDiagPathSum, fromDiagRow, fromDiagCol;
unsigned int ui;
int x = ((unsigned int)1 << (N-1));
for(unsigned int currentPathCode = 0; currentPathCode < x; currentPathCode++)
{
ui = currentPathCode;
toDiagPathSum = M[0];
toDiagRow = 0;
toDiagCol = 0;
fromDiagPathSum = M[(N-1)*N+N-1];
fromDiagRow = N-1;
fromDiagCol = N-1;
for (int i = 0; i < N-1; i++)
{
if (ui % 2 == 0)
{
toDiagCol++; // horizontal move
fromDiagCol--;
}
else
{
toDiagRow++; // vertical move
fromDiagRow--;
}
toDiagPathSum += M[toDiagRow*N+toDiagCol];
fromDiagPathSum += M[fromDiagRow*N+fromDiagCol];
ui = ui >> 1;
}
if (toDiagPathSum < bestToDiagPathSum[toDiagRow])
{
bestToDiagPathSum[toDiagRow] = toDiagPathSum;
bestToDiagPathCode[toDiagRow] = currentPathCode;
}
if (fromDiagPathSum < bestFromDiagPathSum[fromDiagRow])
{
bestFromDiagPathSum[fromDiagRow] = fromDiagPathSum;
bestFromDiagPathCode[fromDiagRow] = currentPathCode;
}
}
int bestPathSum = biggerThanMaxPathSum;
unsigned int bestPathCodePrefix;
unsigned int bestPathCodeSuffix;
int tempSum;
for (int i = 0; i < N; i++)
{
tempSum = bestToDiagPathSum[i] + bestFromDiagPathSum[i] - M[i*N+(N-1-i)];
if (tempSum < bestPathSum)
{
bestPathSum = tempSum;
bestPathCodePrefix = bestToDiagPathCode[i];
bestPathCodeSuffix = bestFromDiagPathCode[i];
}
}
//--- output best path sum and best path diagram
printf("Best Path Sum = %d\n\n",bestPathSum);
M[0] = -M[0];
toDiagRow = 0;
toDiagCol = 0;
ui = bestPathCodePrefix;
for (int i = 0; i < N-1; i++)
{
if (ui % 2 == 0)
toDiagCol++; // horizontal move
else
toDiagRow++; // vertical move
M[toDiagRow*N+toDiagCol] = -M[toDiagRow*N+toDiagCol];
ui = ui >> 1;
}
M[(N-1)*N+N-1] = -M[(N-1)*N+N-1];
fromDiagRow = N-1;
fromDiagCol = N-1;
ui = bestPathCodeSuffix;
for (int i = 0; i < N-2; i++)
{
if (ui % 2 == 0)
fromDiagCol--; // horizontal move
else
fromDiagRow--; // vertical move
M[fromDiagRow*N+fromDiagCol] = -M[fromDiagRow*N+fromDiagCol];
ui = ui >> 1;
}
for (int row = N-1; row >= 0; row--)
{
for (int col = 0; col <= N-1; col++)
if (M[row*N+col] < 0)
{
printf("*");
M[row*N+col] = -M[row*N+col];
}
else
printf("%d",M[row*N+col]);
printf("\n");
}
printf("\nTime elapsed: %f", ((double)clock() - start) / CLOCKS_PER_SEC);
int dummyReadForPause;
scanf_s("%d",&dummyReadForPause);
return 0;
}
答案 0 :(得分:2)
如果你有一个长时间运行的内核,Windows将停止你的驱动程序,以防止系统无响应。您需要执行一些注册以删除此功能。看看这里:http://msdn.microsoft.com/en-us/windows/hardware/gg487368.aspx