Question

我已经实现了以下CUDA代码，但我对这种行为感到有点困惑。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <ctime>
#include <chrono>
#include <string>

#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1)) 

void PrintMatrix(float* a, int n)
{
    int j, i;
    for (j = 1; j <= n; j++)
    {
        for (i = 1; i <= n; i++)
        {
            printf("%7.0f", a[IDX2F(i, j, n)]);
        }
        printf("\n");
    }
}

float* CreateMatrix(int n)
{
    float* matrix = static_cast<float *>(malloc(n * n * sizeof(float)));
    if (!matrix)
    {
        printf("host memory allocation failed");
        return nullptr;
    }

    for (int j = 1; j <= n; j++)
    {
        for (int i = 1; i <= n; i++)
        {
            matrix[IDX2F(i, j, n)] = 2;
        }
    }

    return matrix;
}

long CudaMatrixMultiply(float* matrix, int n)
{
    cudaError_t cudaStat;
    cublasStatus_t status;
    cublasHandle_t handle;
    float* deviceMatrix;

    cudaStat = cudaMalloc(reinterpret_cast<void**>(&deviceMatrix), n * n * sizeof(float));
    if (cudaStat != cudaSuccess)
    {
        printf("device memory allocation failed");
        return EXIT_FAILURE;
    }

    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }

    status = cublasSetMatrix(n, n, sizeof(float), matrix, n, deviceMatrix, n);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("data download failed");
        cudaFree(deviceMatrix);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    float alpha = 1;
    float beta = 0;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, deviceMatrix, n, deviceMatrix, n, &beta, deviceMatrix, n);

    status = cublasGetMatrix(n, n, sizeof(float), deviceMatrix, n, matrix, n);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("data upload failed");
        cudaFree(deviceMatrix);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    cudaFree(deviceMatrix);
    cublasDestroy(handle);
    return EXIT_SUCCESS;
}

float* CpuMatrixMultiply(float* matrix, int size)
{
    float* result = new float[size * size]();

    // Copied from https://msdn.microsoft.com/en-us/library/hh873134.aspx
    for (int row = 1; row <= size; row++) 
    {
        for (int col = 1; col <= size; col++) 
        {
            // Multiply the row of A by the column of B to get the row, column of product.
            for (int inner = 1; inner <= size; inner++) 
            {
                // result[row][col] += matrix[row][inner] * matrix[inner][col];
                result[IDX2F(col, row, size)] += matrix[IDX2F(inner, row, size)] * matrix[IDX2F(col, inner, size)];
            }
        }
    }

    free(matrix);
    return result;
}

int main(void)
{
    // printf("Matrix * Matrix Test\n");
    int size = 1000;
    int runs = 10;

    for (int run = 0; run != runs; run++)
    {
        printf("=== Test %d (Matrix * Matrix, Size = %d) ===\n\n", run + 1, size);
        printf("RAM usage is: %f GB\n", size * size * sizeof(float) / 1000000000.0);

        float* cpuMatrix = CreateMatrix(size);
        cpuMatrix = CpuMatrixMultiply(cpuMatrix, size);

        PrintMatrix(cpuMatrix, 5);

        float* gpuMatrix = CreateMatrix(size);
        CudaMatrixMultiply(gpuMatrix, size);
        PrintMatrix(gpuMatrix, 5);

        free(cpuMatrix);
        free(gpuMatrix);
    }
    getchar();
    return EXIT_SUCCESS;
}

CPU 版本 MatrixMultiplication 的输出如下所示：

4000 4000 4000 4000 4000
     4000 4000 4000 4000 4000
     4000 4000 4000 4000 4000
     4000 4000 4000 4000 4000
     4000 4000 4000 4000 4000

但GPU计算的结果有时是正确的（见上文）或错误的随机（？）。第一次执行循环时，结果始终是正确的。

我无法在我的代码中发现错误，如果你能帮助我，那就太棒了。

另外，如果我将size（主方法中的int）设置为例如16000然后我的驱动程序崩溃，我收到一条错误消息。为此，我已经向NVidea写了一个错误报告，因为我的电脑崩溃了两次。但也许这是我的编程错误？

司机：364.72（最新的）
SDK：CUDA Toolkit 7.5
显卡：NVidia GeForce GTX 960（4GB）
Windows 10 64位

驱动程序错误

显示驱动程序NVIDIA Windows内核模式驱动程序，版本362.72停止响应并已成功恢复。

编辑：在社区的帮助下，我发现这是看门狗计时器的问题。请参阅下面的答案。

Answer 1

关于问题的第二部分，在njuffa的注释之后，您可以更改驱动程序行为的设置，以避免在增加大小时出错。打开NSIGHT Monitor，在选项，常规，Microsoft显示驱动程序中，更改为错误 启用WDDM TDR 字段。

从spec开始，32位FPU触发器的单精度应该在2.4 TFLOPS左右，因此对16000大小矩阵的操作应该至少需要3.5秒。因此2秒后驱动程序恢复。

来自cuBlas的不同结果

1 个答案: