为什么我的CUDA内核在数据集大小不同时会崩溃(未指定的启动失败)?

时间:2016-04-28 00:41:57

标签: cuda

我有一个内核来根据它们的位置(对角线或非对角线)计算矩阵的不同元素。在计算大小矩阵时,内核按预期工作:

  • 14 x 14(我知道这很小,并没有正确使用GPU资源,但这纯粹是出于测试目的,以确保结果是正确的)
  • 118 x 118,
  • 300 x 300

但是,当我尝试计算大小为2383 x 2383的矩阵时,内核崩溃了。具体来说,错误"未指定的启动失败"(unspecified launch failure)是在将结果从设备复制回主机的cudaMemcpy()一行抛出的。通过查阅资料我了解到,这种错误通常出现在越界内存访问的情况下(例如数组越界),然而我不明白的是,为什么它在前三种情况下正常工作,而在2383 x 2383的情况下却不行。内核代码如下所示:

/*
 * Builds the bus admittance matrix y (row-major, numberOfBuses x numberOfBuses)
 * from per-branch data.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per matrix element
 * (x -> column, y -> row). Threads outside the matrix return immediately.
 *
 * Parameters:
 *   R, X, B           per-branch values, read at indices [0, numberOfBranches)
 *   numberOfBuses     matrix dimension
 *   numberOfBranches  number of entries in R, X, B, fromBus and toBus
 *   fromBus, toBus    per-branch end-point bus numbers; they are compared
 *                     directly against the row/column index, so they are
 *                     presumably zero-based -- TODO confirm against the caller
 *   y                 output matrix, numberOfBuses*numberOfBuses elements
 *
 * Preconditions: diagonal entries are accumulated onto the value already
 * stored in y, so the caller must initialise y (e.g. to zero) before launch.
 * Every thread scans all branches, so run time grows with
 * numberOfBuses^2 * numberOfBranches.
 */
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    const int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    const int colIdx = blockIdx.x*blockDim.x + threadIdx.x;
    if (rowIdx >= numberOfBuses || colIdx >= numberOfBuses) {
        return; // padding thread of a partial tile
    }

    // Widen before multiplying: rowIdx*numberOfBuses overflows a 32-bit int
    // once the matrix has more than 2^31 elements.
    const size_t index = (size_t)rowIdx*numberOfBuses + colIdx;

    for (int i = 0; i < numberOfBranches; ++i)
    {
        // Read the branch end points once instead of once per comparison.
        const int from = fromBus[i];
        const int to = toBus[i];

        const bool rowIsFrom = (rowIdx == from);
        const bool rowIsTo   = (rowIdx == to);
        const bool colIsFrom = (colIdx == from);
        const bool colIsTo   = (colIdx == to);

        // None of the four cases below can match unless both the row and the
        // column coincide with an end point of this branch.
        if (!(rowIsFrom || rowIsTo) || !(colIsFrom || colIsTo)) {
            continue;
        }

        // Shared terms: R/(R^2+X^2) and X/(R^2+X^2).
        const float r = R[i];
        const float x = X[i];
        const float denom = (r*r) + (x*x);
        const float g = r/denom;
        const float b = x/denom;
        const cuComplex diagTerm = make_cuComplex(g, (-b + (B[i]/2)));
        const cuComplex offDiagTerm = make_cuComplex(-g, b);

        // Same order of updates as the straightforward four-if formulation.
        if (rowIsFrom && colIsFrom) { //diagonal element
            y[index] = cuCaddf(y[index], diagTerm);
        }
        if (rowIsTo && colIsTo) { //diagonal element
            y[index] = cuCaddf(y[index], diagTerm);
        }
        if (rowIsFrom && colIsTo) { //off-diagonal element
            y[index] = offDiagTerm;
        }
        if (rowIsTo && colIsFrom) { //off-diagonal element
            y[index] = offDiagTerm;
        }
    }
}

全局内存分配是通过调用cudaMalloc()完成的。代码中的分配如下:

// Byte counts shared by the allocations below.
const size_t branchIntBytes   = numLines*sizeof(int);
const size_t branchFloatBytes = numLines*sizeof(float);
const size_t busFloatBytes    = numberOfBuses*sizeof(float);
const size_t yBusBytes        = numberOfBuses*numberOfBuses*sizeof(cuComplex);

// Buffers with one entry per line (numLines entries each).
cudaStat1 = cudaMalloc(reinterpret_cast<void**>(&dev_fromBus), branchIntBytes);
cudaStat2 = cudaMalloc(reinterpret_cast<void**>(&dev_toBus), branchIntBytes);
cudaStat3 = cudaMalloc(reinterpret_cast<void**>(&dev_R), branchFloatBytes);
cudaStat4 = cudaMalloc(reinterpret_cast<void**>(&dev_X), branchFloatBytes);
cudaStat5 = cudaMalloc(reinterpret_cast<void**>(&dev_B), branchFloatBytes);

// numberOfBuses x numberOfBuses complex matrix.
cudaStat6 = cudaMalloc(reinterpret_cast<void**>(&dev_y), yBusBytes);

// Buffers with one float per bus.
cudaStat7 = cudaMalloc(reinterpret_cast<void**>(&dev_Pd), busFloatBytes);
cudaStat8 = cudaMalloc(reinterpret_cast<void**>(&dev_Qd), busFloatBytes);
cudaStat9 = cudaMalloc(reinterpret_cast<void**>(&dev_Vmag), busFloatBytes);
cudaStat10 = cudaMalloc(reinterpret_cast<void**>(&dev_theta), busFloatBytes);
cudaStat11 = cudaMalloc(reinterpret_cast<void**>(&dev_Peq), busFloatBytes);
cudaStat12 = cudaMalloc(reinterpret_cast<void**>(&dev_Qeq), busFloatBytes);
cudaStat13 = cudaMalloc(reinterpret_cast<void**>(&dev_Peq1), busFloatBytes);
cudaStat14 = cudaMalloc(reinterpret_cast<void**>(&dev_Qeq1), busFloatBytes);
...
...
// Byte counts for the jacSize-sized buffers.
const size_t jacVectorBytes = jacSize*sizeof(float);
const size_t jacMatrixBytes = jacSize*jacSize*sizeof(float);
const size_t jacIndexBytes  = jacSize*sizeof(int);

cudaStat15 = cudaMalloc(reinterpret_cast<void**>(&dev_powerMismatch), jacVectorBytes);
cudaStat16 = cudaMalloc(reinterpret_cast<void**>(&dev_jacobian), jacMatrixBytes);   // jacSize x jacSize floats
cudaStat17 = cudaMalloc(reinterpret_cast<void**>(&dev_stateVector), jacVectorBytes);
cudaStat18 = cudaMalloc(reinterpret_cast<void**>(&dev_PQindex), jacIndexBytes);

其中cudaStatN的类型为cudaError_t以捕获错误。最后四个分配是在代码中稍后完成的,用于另一个内核。但是,这些分配是在调用相关内核之前完成的。

启动参数如下:

// 16x16 threads per block; the grid is rounded up so that partial tiles at the
// right and bottom edges of the matrix are still covered.
dim3 dimBlock(16, 16); //number of threads 
dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16);  //number of blocks

//launch kernel once data has been copied to GPU
createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);

// A kernel launch returns no status of its own: configuration errors are
// reported by cudaGetLastError() ...
cudaError_t launchStat = cudaGetLastError();
if (launchStat != cudaSuccess) {
    cout<<"Kernel launch failed"<<endl;
    cout<<cudaGetErrorString(launchStat)<<endl;
    return 1;
}

// ... and faults during execution (e.g. an illegal address or a watchdog
// timeout) only surface at the next synchronizing call. Synchronize here so
// such a failure is attributed to the kernel rather than to the cudaMemcpy below.
cudaError_t syncStat = cudaDeviceSynchronize();
if (syncStat != cudaSuccess) {
    cout<<"Kernel execution failed"<<endl;
    cout<<cudaGetErrorString(syncStat)<<endl;
    return 1;
}

//copy results back to CPU
cudaStat6 = cudaMemcpy(y_bus, dev_y, numberOfBuses*numberOfBuses*sizeof(cuComplex), cudaMemcpyDeviceToHost);
if (cudaStat6 != cudaSuccess) {
    cout<<"Device memcpy failed"<<endl;
    cout<<cudaGetErrorString(cudaStat6)<<endl;
    return 1;
}

我删除了时序代码,只是为了显示块和网格尺寸以及使用的错误检查技术。

我还有这个函数的主机(C ++代码)版本,我将数据传递给两个函数然后比较结果,首先是为了确保内核产生正确的结果,其次是在执行时间方面比较性能。我仔细检查了2383 x 2383情况的数据(它是从文本文件中读入并复制到全局内存中)并且我没有在数组访问/索引中发现任何异常。

我使用的是Visual Studio 2010,所以我尝试使用Nsight来查找错误(我不太熟悉Nsight)。摘要报告概述指出:"报告了1个运行时API调用错误。(有关详细信息,请参阅CUDA Runtime API调用报告)"。在运行时API调用列表中,cudaMemcpy返回了错误4——我不确定表中的线程ID(5012)是否有任何意义,这个数字每次运行都会变化。CUDA memcheck工具(在命令行中运行)返回以下内容:

Thank you for using this program
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
=========     Saved host backtrace up to driver entry point at error
=========
========= ERROR SUMMARY: 1 error

我知道我的内核不是最高效的,因为其中有许多全局内存访问。为什么内核在处理这个更大的矩阵时会崩溃?我是否遗漏了某处越界的数组访问?非常感谢任何帮助。

2 个答案:

答案 0 :(得分:10)

解决了这个问题。原来是WDDM TDR(超时检测与恢复)处于启用状态,且延迟设置为2秒。这意味着如果内核执行时间超过2秒,驱动程序就会重置并恢复。这对图形和渲染(GPU的常规用途)来说没有问题。但是,在这种情况下,必须禁用TDR或增加延迟。将延迟增加到10秒后,崩溃错误“未指定的启动失败”不再出现,内核也能像之前一样正常执行。

TDR延迟(以及启用/禁用)可以通过Nsight Monitor中的Nsight选项设置,也可以通过注册表(HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers)中的DWORD值TdrDelay和TdrLevel设置。

答案 1 :(得分:0)

我尝试使用以下完整示例重现您的代码。代码编译,运行没有错误。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include "cuComplex.h"

/*
 * Builds the bus admittance matrix y (row-major, numberOfBuses x numberOfBuses)
 * from per-branch data.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per matrix element
 * (x -> column, y -> row). Threads outside the matrix return immediately.
 *
 * Parameters:
 *   R, X, B           per-branch values, read at indices [0, numberOfBranches)
 *   numberOfBuses     matrix dimension
 *   numberOfBranches  number of entries in R, X, B, fromBus and toBus
 *   fromBus, toBus    per-branch end-point bus numbers; they are compared
 *                     directly against the row/column index, so they are
 *                     presumably zero-based -- TODO confirm against the caller
 *   y                 output matrix, numberOfBuses*numberOfBuses elements
 *
 * Preconditions: diagonal entries are accumulated onto the value already
 * stored in y, so the caller must initialise y (e.g. to zero) before launch.
 * Every thread scans all branches, so run time grows with
 * numberOfBuses^2 * numberOfBranches.
 */
__global__ void createYBus(float *R, float *X, float *B, int numberOfBuses, int numberOfBranches, int *fromBus, int *toBus, cuComplex *y)
{
    const int rowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    const int colIdx = blockIdx.x*blockDim.x + threadIdx.x;
    if (rowIdx >= numberOfBuses || colIdx >= numberOfBuses) {
        return; // padding thread of a partial tile
    }

    // Widen before multiplying: rowIdx*numberOfBuses overflows a 32-bit int
    // once the matrix has more than 2^31 elements.
    const size_t index = (size_t)rowIdx*numberOfBuses + colIdx;

    for (int i = 0; i < numberOfBranches; ++i)
    {
        // Read the branch end points once instead of once per comparison.
        const int from = fromBus[i];
        const int to = toBus[i];

        const bool rowIsFrom = (rowIdx == from);
        const bool rowIsTo   = (rowIdx == to);
        const bool colIsFrom = (colIdx == from);
        const bool colIsTo   = (colIdx == to);

        // None of the four cases below can match unless both the row and the
        // column coincide with an end point of this branch.
        if (!(rowIsFrom || rowIsTo) || !(colIsFrom || colIsTo)) {
            continue;
        }

        // Shared terms: R/(R^2+X^2) and X/(R^2+X^2).
        const float r = R[i];
        const float x = X[i];
        const float denom = (r*r) + (x*x);
        const float g = r/denom;
        const float b = x/denom;
        const cuComplex diagTerm = make_cuComplex(g, (-b + (B[i]/2)));
        const cuComplex offDiagTerm = make_cuComplex(-g, b);

        // Same order of updates as the straightforward four-if formulation.
        if (rowIsFrom && colIsFrom) { //diagonal element
            y[index] = cuCaddf(y[index], diagTerm);
        }
        if (rowIsTo && colIsTo) { //diagonal element
            y[index] = cuCaddf(y[index], diagTerm);
        }
        if (rowIsFrom && colIsTo) { //off-diagonal element
            y[index] = offDiagTerm;
        }
        if (rowIsTo && colIsFrom) { //off-diagonal element
            y[index] = offDiagTerm;
        }
    }
}


// Reports a failed CUDA runtime call and tells the caller whether to continue.
// Returns true when status is cudaSuccess; otherwise prints the failing step
// and the runtime's error string, then returns false.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    printf ("%s failed\n", what) ;
    printf ("failure : (%d) - %s\n", (int)status, ::cudaGetErrorString(status)) ;
    return false;
}

// Stand-alone reproduction: builds a small chain network (32 branches on a
// 2383-bus system), runs createYBus once and copies the matrix back.
// Returns 0 on success and 1 if any CUDA call fails.
int main ()
{
    const int numLines = 32 ;
    const int numberOfBuses = 2383 ;
    // Element count in size_t so the byte-size products below cannot overflow int.
    const size_t numEntries = (size_t)numberOfBuses*numberOfBuses ;

    int* dev_fromBus = NULL, *dev_toBus = NULL;
    float *dev_R = NULL, *dev_X = NULL, *dev_B = NULL;
    cuComplex* dev_y = NULL ; 
    cuComplex* y_bus = NULL ;
    int exitCode = 1 ;

    // Deterministic host inputs: branch i connects bus i to bus i+1, all with
    // the same non-zero impedance so R^2+X^2 is never zero.
    int host_fromBus[numLines], host_toBus[numLines];
    float host_R[numLines], host_X[numLines], host_B[numLines];
    for (int i = 0; i < numLines; ++i) {
        host_fromBus[i] = i;
        host_toBus[i] = i + 1;
        host_R[i] = 0.01f;
        host_X[i] = 0.1f;
        host_B[i] = 0.02f;
    }

    // Single-exit block: any failure breaks out to the cleanup code below.
    do {
        if (!cudaOk(cudaMalloc((void**)&dev_fromBus, numLines*sizeof(int)), "cudaMalloc dev_fromBus")) break;
        if (!cudaOk(cudaMalloc((void**)&dev_toBus, numLines*sizeof(int)), "cudaMalloc dev_toBus")) break;
        if (!cudaOk(cudaMalloc((void**)&dev_R, numLines*sizeof(float)), "cudaMalloc dev_R")) break;
        if (!cudaOk(cudaMalloc((void**)&dev_X, numLines*sizeof(float)), "cudaMalloc dev_X")) break;
        if (!cudaOk(cudaMalloc((void**)&dev_B, numLines*sizeof(float)), "cudaMalloc dev_B")) break;
        if (!cudaOk(cudaMalloc((void**)&dev_y, numEntries*sizeof(cuComplex)), "cudaMalloc dev_y")) break;

        if (!cudaOk(cudaMemcpy(dev_fromBus, host_fromBus, sizeof(host_fromBus), cudaMemcpyHostToDevice), "cudaMemcpy dev_fromBus")) break;
        if (!cudaOk(cudaMemcpy(dev_toBus, host_toBus, sizeof(host_toBus), cudaMemcpyHostToDevice), "cudaMemcpy dev_toBus")) break;
        if (!cudaOk(cudaMemcpy(dev_R, host_R, sizeof(host_R), cudaMemcpyHostToDevice), "cudaMemcpy dev_R")) break;
        if (!cudaOk(cudaMemcpy(dev_X, host_X, sizeof(host_X), cudaMemcpyHostToDevice), "cudaMemcpy dev_X")) break;
        if (!cudaOk(cudaMemcpy(dev_B, host_B, sizeof(host_B), cudaMemcpyHostToDevice), "cudaMemcpy dev_B")) break;

        // The kernel accumulates onto diagonal entries, so the matrix must
        // start from zero (all-zero bytes are 0.0f for both components).
        if (!cudaOk(cudaMemset(dev_y, 0, numEntries*sizeof(cuComplex)), "cudaMemset dev_y")) break;

        dim3 dimBlock(16, 16); //number of threads 
        dim3 dimGrid((numberOfBuses+15)/16, (numberOfBuses+15)/16);  //number of blocks

        //launch kernel once data has been copied to GPU
        createYBus<<<dimGrid, dimBlock>>>(dev_R, dev_X, dev_B, numberOfBuses, numLines, dev_fromBus, dev_toBus, dev_y);

        // Launch-configuration errors show up in cudaGetLastError(); faults
        // during execution only show up at the next synchronizing call.
        if (!cudaOk(cudaGetLastError(), "createYBus launch")) break;
        if (!cudaOk(cudaDeviceSynchronize(), "createYBus execution")) break;

        y_bus = new cuComplex[numEntries] ;

        //copy results back to CPU
        if (!cudaOk(cudaMemcpy(y_bus, dev_y, numEntries*sizeof(cuComplex), cudaMemcpyDeviceToHost), "cudaMemcpy y_bus")) break;

        exitCode = 0 ;
    } while (false);

    // Release everything that was acquired; cudaFree and delete[] both accept
    // null pointers, so this is safe after a partial failure.
    delete [] y_bus ;
    cudaFree(dev_y);
    cudaFree(dev_B);
    cudaFree(dev_X);
    cudaFree(dev_R);
    cudaFree(dev_toBus);
    cudaFree(dev_fromBus);

    return exitCode ;
}

您的错误似乎在其他地方。

您可以尝试在启用CUDA memcheck的Nsight调试模式下运行代码。如果编译时带有调试信息,该工具应该能指出错误的位置。

编辑:问题似乎是由WDDM TDR引起的,如评论中所述。