Question

我很欣赏在内核初始化之后将bools d_unique数组从设备复制到主机时以下程序崩溃的原因。

我的GPU是Quadro K1000M（移动 - 计算能力3.0）。我正在使用CUDA第8版。

#include <iostream>

// nvcc -ccbin g++ -g -m64 -gencode arch=compute_30,code=sm_30 -o Bug Bug.cu

// Helper Functions Decl
void allocateDeviceMemory( void* devPtr , unsigned size , int lineNumber );
void copyDataToHost( void* hostPtr , void* devPtr , unsigned size , int lineNumber );
void copyDataToDevice( void* devPtr , void* hostPtr , unsigned size , int lineNumber );
void initializeDeviceMemory( void* devPtr , unsigned size , unsigned initValue , int lineNumber );

__global__ void myKernel( const ushort* __restrict__ dataPtr , const ushort* __restrict__ proxyId , bool* unique , unsigned size , const ushort dim )
{
    int N = threadIdx.x + ( blockIdx.x * blockDim.x );

    if( N < size - 1 )
    {
        unsigned offset;
        ushort countPtr = 0;
        ushort id1 = proxyId[N];
        ushort id2 = proxyId[N + 1];

        for( ushort i = 0; i < dim; ++i )
        {
            if( dataPtr[offset + id1] == dataPtr[offset + id2] ) ++countPtr;
            offset += size;
        }

        unique[N + 1] = ( countPtr != dim );    // No crash if commented out
    }
}

int main(int argc, char** argv)
{
    ushort dim = 2;
    static const unsigned SIZE = 10;

    ushort h_proxyId[SIZE] = { 6 , 3 , 1 , 0 , 7 , 4 , 2 , 8 , 5 , 9 };


    ushort h_dataPtr[ 2 * SIZE] = { 1 , 1 , 2 , 1 , 2 , 3 , 1 , 2 , 3 , 4 ,
                                    4 , 3 , 3 , 2 , 2 , 2 , 1 , 1 , 1 , 1 };

    ushort* d_proxyId = 0;
    ushort* d_dataPtr = 0;

    bool* d_unique = 0; 
    bool* h_unique = new bool[SIZE];

    allocateDeviceMemory( &d_unique , SIZE , __LINE__ );    
    allocateDeviceMemory( &d_proxyId , SIZE * sizeof(ushort) , __LINE__ );
    allocateDeviceMemory( &d_dataPtr , SIZE * sizeof(ushort) * 2 , __LINE__ );

    copyDataToDevice( d_proxyId , h_proxyId , SIZE * sizeof(ushort) , __LINE__ );
    copyDataToDevice( d_dataPtr , h_dataPtr , SIZE * sizeof(ushort) * 2 , __LINE__ );

    initializeDeviceMemory( d_unique , SIZE , 1 , __LINE__ );

    myKernel<<<1,SIZE>>>( d_dataPtr , d_proxyId , d_unique , SIZE , dim );  // No crash if commented out

    copyDataToHost( h_unique , d_unique , SIZE , __LINE__ );    // Crashes here

    return 0;
}

// Helper Functions Impl
void allocateDeviceMemory( void* devPtr , unsigned size , int lineNumber )
{
    cudaError_t error = cudaMalloc( (void**) devPtr , size );
    if( error != cudaSuccess )
    {
        std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to allocate device memory] " << cudaGetErrorString( error ) << std::endl;
        exit(-1);
    }
}

void initializeDeviceMemory( void* devPtr , unsigned size , unsigned initValue , int lineNumber  )
{
    cudaError_t error = cudaMemset( devPtr , initValue , size );
    if( error != cudaSuccess )
    {
        std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to initialize device memory to default value] " << cudaGetErrorString( error ) << std::endl;
        exit(-1);
    }        
}

void copyDataToHost( void* hostPtr , void* devPtr , unsigned size , int lineNumber  )
{
    cudaError_t error = cudaMemcpy( hostPtr , devPtr , size , cudaMemcpyDeviceToHost );
    if( error != cudaSuccess )
    {
        std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to copy device data to host memory] " << cudaGetErrorString( error ) << std::endl;
        exit(-1);
    }
}

void copyDataToDevice( void* devPtr , void* hostPtr , unsigned size , int lineNumber  )
{
    cudaError_t error = cudaMemcpy( devPtr , hostPtr , size , cudaMemcpyHostToDevice );
    if( error != cudaSuccess )
    {
        std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to copy host data to device memory] " << cudaGetErrorString( error ) << std::endl;
        exit(-1);
    }
}

我在二进制文件上运行cuda-memcheck，但它没有产生任何信息，即

ThinkPad-W530:~/tmp/CUDA/Prototype$ cuda-memcheck ./Bug
========= CUDA-MEMCHECK
[Line 59 -- Error 4 : Unable to copy device data to host memory] unspecified launch failure
========= Internal error (7)
========= No CUDA-MEMCHECK results found

Answer 1

它实际上可能不是cudaMemcpy的错误，而是你的内核启动。 Cuda错误是持久性的，并且您从内核启动而不是内存复制中获得潜在错误。您可以在内核之后运行cudaGetLastError（）来验证。

未指定的启动失败特定于内核。

从设备到主机的cudaMemcpy上的cudaErrorLaunchFailure

1 个答案: