Question

我有一个 __host__ __device__ 函数，它是一个包装器，会调用 Thrust 库的 sort 函数。在这个包装器内部，我使用 __CUDA_ARCH__ 标志来设置执行策略：从主机调用时为 thrust::device，从设备调用时为 thrust::seq。以下代码会产生运行时错误：

#ifndef __CUDA_ARCH__
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
#else
    thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
#endif

错误是-

意外的标准异常：what() 为：merge_sort: failed on 2nd step: invalid device function（无效的设备函数）

据我了解，__CUDA_ARCH__ 可用于条件编译。请帮我理解为什么会引发此错误。

Answer 1

您似乎遇到了这个问题（this issue）。简而言之，Thrust 在某些算法（包括排序）的底层使用了 CUB 功能。您在代码中用 __CUDA_ARCH__ 宏包裹了会用到 CUB 的 Thrust 算法调用，这干扰了 CUB 代码——CUB 期望能够在所有代码路径中使用该宏。

一种可能的解决方法是“自行分派”：

$ cat t142.cu
#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>


// Strict "greater than" comparator; using it as a sort predicate orders
// elements in descending order. Callable from host and device code.
template <typename T>
struct customGreater {
  // Returns true when t1 must be ordered before t2, i.e. when t1 > t2.
  // Const-qualified and taking const references: the comparison never mutates
  // its operands, and the functor can be invoked on const objects and with
  // temporaries.
  __host__ __device__ bool operator()(const T &t1, const T &t2) const {
    return t1 > t2;
  }
};

// Stable sort of data[0, num) with customGreater<T>, callable from both host
// and device code. Host callers use the thrust::device policy; device callers
// use thrust::seq.
//
// __CUDA_ARCH__ only selects the value of a flag here. Both
// thrust::stable_sort calls stay outside the preprocessor conditional so that
// they are compiled in the host pass and in the device pass alike.
template <typename T>
__host__ __device__
void my_sort_wrapper(T *data, size_t num){
    bool in_device_code = false;
#ifdef __CUDA_ARCH__
    in_device_code = true;
#endif
    T *last = data + num;
    if (in_device_code) {
        thrust::stable_sort(thrust::seq, data, last, customGreater<T>());
    } else {
        thrust::stable_sort(thrust::device, data, last, customGreater<T>());
    }
}

// Kernel entry point that runs my_sort_wrapper on data[0, num) from device
// code. There is no thread-index guard, so every launched thread performs the
// whole sort; main() launches it as <<<1,1>>>.
template <typename T>
__global__ void my_dev_sort(T *data, size_t num)
{
  my_sort_wrapper<T>(data, num);
}
typedef int mytype;
const size_t sz = 10;

// Exercises my_sort_wrapper from host code and, through my_dev_sort, from
// device code on the same zero-filled device buffer.
// Returns 0 on success and 1 if a CUDA runtime call or the kernel fails.
int main(){
  mytype *d_data = NULL;
  const size_t bytes = sz*sizeof(mytype);

  cudaError_t err = cudaMalloc(&d_data, bytes);
  if (err == cudaSuccess) err = cudaMemset(d_data, 0, bytes);
  if (err == cudaSuccess) {
    my_sort_wrapper(d_data, sz);        // host call path
    my_dev_sort<<<1,1>>>(d_data, sz);   // device call path
    err = cudaGetLastError();           // a launch does not return its own error
  }
  // Wait for the kernel so that failures during execution are reported here.
  if (err == cudaSuccess) err = cudaDeviceSynchronize();

  cudaFree(d_data);  // harmless when d_data is still NULL

  if (err != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    return 1;
  }
  return 0;
}
$ nvcc t142.cu -o t142
$ cuda-memcheck ./t142
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

采用这种实现后，__CUDA_ARCH__ 宏的使用不会干扰 Thrust 算法的编译。

另一种可能的解决方法是在两种情况下都直接使用 thrust::device 策略（不做分派，只保留 Thrust 算法调用）。除非使用 CUDA 动态并行（CUDA Dynamic Parallelism），否则 thrust::device 在设备代码中使用时会“退化”（decay）为 thrust::seq。

我预计，只有当 Thrust 算法的底层实现使用了 CUB 功能时，这些建议才是必要的/相关的。

如果您不喜欢这种行为，可以向 Thrust 项目提交 issue。

Answer 2

不幸的是，我们无法在Thrust中解决此问题。这里的问题在于，NVCC编译器需要在主机编译期间查看所有__global__函数模板实例化（例如，未定义__CUDA_ARCH__时），否则内核将被视为未使用并被丢弃。有关更多详细信息，请参见this CUB GitHub issue。

如Robert所建议的，这样的解决方法应该没问题：

#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

// Strict "greater than" comparator; using it as a sort predicate orders
// elements in descending order. Callable from host and device code.
template <typename T>
struct customGreater {
  // Returns true when t1 must be ordered before t2, i.e. when t1 > t2.
  // Const-qualified and taking const references: the comparison never mutates
  // its operands, and the functor can be invoked on const objects and with
  // temporaries.
  __host__ __device__ bool operator()(const T &t1, const T &t2) const {
    return t1 > t2;
  }
};

// DEVICE_COMPILATION expands to 1 while __CUDA_ARCH__ is defined and to 0
// otherwise, so code can branch on it with an ordinary `if` instead of
// wrapping calls in preprocessor conditionals.
#ifdef __CUDA_ARCH__
#  define DEVICE_COMPILATION 1
#else
#  define DEVICE_COMPILATION 0
#endif

// Stable sort of data[0, num) with customGreater<T>, callable from both host
// and device code. Host callers use thrust::device and device callers use
// thrust::seq.
//
// The branch is an ordinary `if` on DEVICE_COMPILATION rather than an #if, so
// both thrust::stable_sort calls are compiled in the host pass and in the
// device pass.
template <typename T>
__host__ __device__
void my_sort(T *data, size_t num){
  if (DEVICE_COMPILATION)
    // Device code: sort sequentially in the calling thread.
    thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
  else
    // Host code: run the sort with the device execution policy.
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
}

// Kernel entry point that runs my_sort on data[0, num) from device code.
// There is no thread-index guard, so every launched thread performs the whole
// sort; main() launches it as <<<1,1>>>.
template <typename T>
__global__ void my_dev_sort(T *data, size_t num)
{
  my_sort<T>(data, num);
}
typedef int mytype;
const size_t sz = 10;

// Exercises my_sort from host code and, through my_dev_sort, from device code
// on the same zero-filled managed buffer.
// Returns 0 on success and 1 if a CUDA runtime call or the kernel fails.
int main(){
  mytype *d_data = NULL;
  const size_t bytes = sz*sizeof(mytype);

  // Managed allocation: the same pointer is used by the host call and by the
  // kernel below.
  cudaError_t err = cudaMallocManaged(&d_data, bytes);
  if (err == cudaSuccess) err = cudaMemset(d_data, 0, bytes);
  if (err == cudaSuccess) {
    my_sort(d_data, sz);                // host call path
    my_dev_sort<<<1,1>>>(d_data, sz);   // device call path
    err = cudaGetLastError();           // a launch does not return its own error
  }
  // Synchronize before releasing the buffer so the kernel has finished and
  // any failure during execution is reported here.
  if (err == cudaSuccess) err = cudaDeviceSynchronize();

  cudaFree(d_data);  // harmless when d_data is still NULL

  if (err != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    return 1;
  }
  return 0;
}

与 Thrust 执行策略一起使用的 __CUDA_ARCH__ 标志

2 个答案: