Question

我遇到了一个问题：在同一个 CUDA 6 应用程序中同时使用 cudaMallocManaged() 和 Thrust 时，Thrust 会失败，即使 Thrust 并没有使用任何托管内存。仅仅分配一个从未被使用的托管变量，就足以导致 Thrust 失败。我编写了下面的重现示例，并在运行 CUDA 6.0 的 NVIDIA Jetson TK1 上进行了测试：

#include <cstdio>
#include <stdexcept>
#include <string>

#include "thrust/device_ptr.h"
#include "thrust/sort.h"

/**
 * Fills the hash and particle-ID arrays, one element per thread.
 *
 * Each thread derives a flat index from the x components of its block and
 * thread coordinates. Threads whose index is below `length` store the
 * constant 1 in `hash_values` and their own index in `particle_ids`;
 * all other threads leave both arrays untouched.
 *
 * @param hash_values   Output array; element i is set to 1 for i < length.
 * @param particle_ids  Output array; element i is set to i for i < length.
 * @param length        Number of elements to write in each array.
 */
__global__ void calculate_hash(uint *hash_values, uint *particle_ids, int length)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Tail guard: the grid may contain more threads than elements.
    if (tid < length) {
        hash_values[tid]  = 1;
        particle_ids[tid] = tid;
    }
}

/**
 * Fills the hash/ID arrays on the device, then sorts the IDs by hash value.
 *
 * Launches calculate_hash over `length` elements with 256 threads per block,
 * waits for the device to finish, and calls thrust::sort_by_key using the
 * hash values as keys and the particle IDs as values.
 *
 * @param d_hash_values   Pointer that is wrapped in a thrust::device_ptr and
 *                        used as the sort keys.
 * @param d_particle_ids  Pointer that is wrapped in a thrust::device_ptr and
 *                        used as the sort values.
 * @param length          Number of elements in each array. Nothing is
 *                        launched or sorted when this is <= 0.
 *
 * @throws std::runtime_error if the kernel launch or the synchronization
 *         that follows it reports a CUDA error. Exceptions raised by
 *         thrust::sort_by_key propagate unchanged.
 */
void hash_particles_gpu(uint *d_hash_values, uint *d_particle_ids, int length)
{
    // A zero-block launch is an invalid configuration and there is nothing
    // to sort, so bail out before touching the device.
    if (length <= 0)
        return;

    const int block_size = 256;
    // Integer ceil-div. Going through float loses precision once length
    // exceeds 2^24 and can under-count the blocks needed.
    const int num_blocks = (length - 1) / block_size + 1;

    calculate_hash<<<num_blocks, block_size>>>(d_hash_values, d_particle_ids, length);

    // Launch-configuration errors are only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
        throw std::runtime_error(std::string("calculate_hash launch failed: ")
                                 + cudaGetErrorString(status));

    // Errors raised while the kernel runs surface at the next synchronization.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
        throw std::runtime_error(std::string("calculate_hash execution failed: ")
                                 + cudaGetErrorString(status));

    thrust::device_ptr<uint> keys(d_hash_values);
    thrust::device_ptr<uint> values(d_particle_ids);
    thrust::sort_by_key(keys, keys + length, values);
}

/**
 * Reports a failed CUDA runtime call on stderr.
 *
 * @param status  Return code of the call being checked.
 * @param what    Short label for the call, used in the message.
 * @return true when `status` is cudaSuccess, false otherwise.
 */
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Reproducer entry point.
 *
 * Allocates two device arrays of 15 unsigned ints and passes them to
 * hash_particles_gpu. When compiled with -DBROKE it additionally allocates a
 * single managed int that is never read or written.
 *
 * Exceptions thrown by hash_particles_gpu are deliberately not caught, so an
 * uncaught Thrust error still terminates the process.
 *
 * @return 0 when every CUDA runtime call made here succeeded, 1 otherwise.
 */
int main(int argc, char *argv[])
{
    const int length = 15;
    // Both arrays hold `length` unsigned ints, so one size serves for both.
    const size_t bytes = length * sizeof(unsigned int);
    bool ok = true;

    #ifdef BROKE
    int *m_int = NULL;
    ok = cuda_ok(cudaMallocManaged((void**)&m_int, sizeof(int)), "cudaMallocManaged(m_int)");
    #endif

    // Allocate uint hash value array
    unsigned int *hash_values = NULL;
    ok = ok && cuda_ok(cudaMalloc((void**)&hash_values, bytes), "cudaMalloc(hash_values)");

    // Allocate uint particle ID array
    unsigned int *particle_ids = NULL;
    ok = ok && cuda_ok(cudaMalloc((void**)&particle_ids, bytes), "cudaMalloc(particle_ids)");

    // Only run the workload when every allocation succeeded.
    if (ok)
        hash_particles_gpu(hash_values, particle_ids, length);

    // Pointers that were never allocated are still NULL here; the listed
    // cudaFree reference describes freeing a null pointer as a no-op.
    ok = cuda_ok(cudaFree(particle_ids), "cudaFree(particle_ids)") && ok;
    ok = cuda_ok(cudaFree(hash_values), "cudaFree(hash_values)") && ok;
    #ifdef BROKE
    ok = cuda_ok(cudaFree(m_int), "cudaFree(m_int)") && ok;
    #endif

    return ok ? 0 : 1;
}

当我编译并运行时：

$ nvcc -DBROKE -DTHRUST_DEBUG example.cu -o broke.exe
$ nvcc -DTHRUST_DEBUG example.cu -o fixed.exe
$ ./fixed.exe
$ ./broke.exe
terminate called after throwing an instance of 'thrust::system::system_error'
  what():  synchronize: RakingReduction: unknown error
Abort

我已经检查过以确保在此之前我没有任何错误，一切似乎都没问题，直到我调用sort_by_key。知道发生了什么事吗？

Answer 1

感谢您的评论。我刷入了最新的 Linux for Tegra（L4T）19.3，现在它可以与 CUDA 6.0 正常配合工作了。看起来是 NVIDIA 在 L4T 19.2 上的驱动程序存在问题。

CUDA 6 托管内存导致的 Thrust 问题

1 个答案: