如何使用CUDA Thrust执行策略来覆盖Thrust的低级设备内存分配器

我想覆盖底层的CUDA设备内存分配器(实现为 thrust::system::cuda::detail::malloc()),使其在主机(CPU)线程上被调用时使用自定义分配器,而不是直接调用 cudaMalloc()。


// Custom execution-policy tag type. It derives from Thrust's CUDA execution_policy,
// parameterised on itself (CRTP), and has no members of its own; an instance is
// passed as the first argument of a Thrust algorithm call.
struct eptCGA : thrust::system::cuda::detail::execution_policy<eptCGA>
{
};

/// Overload of the Thrust malloc() template function implementation.
///
/// Compiled for the host (no __CUDA_ARCH__), the request is forwarded to MyMalloc();
/// compiled for the device, no allocation is attempted and NULL is returned.
///
/// NOTE(review): the template parameter is itself named `eptCGA`, which hides the
/// struct of the same name inside this declaration, so this template accepts a first
/// argument of any type -- confirm that this is intended.
///
/// @param n  size forwarded unchanged to MyMalloc() (presumably a byte count -- confirm)
/// @return   the pointer returned by MyMalloc() on the host path, NULL on the device path
template<typename eptCGA> __host__ __device__ void* malloc( eptCGA, size_t n )
{
#ifndef __CUDA_ARCH__
    return MyMalloc( n );   /* (called from a host thread) */
#else
    return NULL;            /* (called from a device GPU thread) */
#endif
}

/* Example call site: an eptCGA instance is passed as the first argument of the
   Thrust algorithm (the remaining arguments are elided with "..."). */
eptCGA epCGA;
thrust::remove_if( epCGA, ... );


    // A device_vector declared with the default allocator; no execution-policy object
    // appears in this declaration (constructor arguments are elided with "...").
    // NOTE(review): presumably this is the case the question is about, i.e. storage
    // that is not allocated through the policy-based malloc() overload -- confirm.
    thrust::device_vector<UINT64> MyDeviceVector( ... );



#include <cassert>
#include <cstdio>

#include <thrust/count.h>
#include <thrust/remove.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/system/cuda/memory.h>
#include <thrust/system/cuda/vector.h>

// Tag type for a custom execution policy: an empty struct that inherits from
// thrust::cuda::execution_policy, parameterised on itself.
struct my_policy
  : thrust::cuda::execution_policy<my_policy>
{
};

// Overload of malloc() chosen when the first argument is a my_policy.
//
// Prints a marker line so that every use of this overload is visible on stdout,
// then forwards the request to thrust::cuda::malloc and returns the result as a
// raw pointer.
//
// n: size passed through unchanged to thrust::cuda::malloc.
__host__ __device__ void* malloc(my_policy, size_t n )
{
  printf("hello, world from my special malloc!\n");

  return thrust::raw_pointer_cast(thrust::cuda::malloc(n));
}

// Custom allocator that obtains its storage through the my_policy malloc() overload.
// It inherits from thrust::cuda::allocator<T> to reuse that allocator's existing
// functionality; only allocate() is replaced here.
template<class T>
struct my_allocator : thrust::cuda::allocator<T>
{
  using super_t = thrust::cuda::allocator<T>;
  using pointer = typename super_t::pointer;

  // Allocates room for n objects of type T by requesting sizeof(T) * n bytes from
  // malloc(my_policy, ...), and returns the result as the base allocator's pointer type.
  pointer allocate(size_t n)
  {
    T* raw_ptr = reinterpret_cast<T*>(malloc(my_policy{}, sizeof(T) * n));

    // wrap the raw pointer in the special pointer wrapper for cuda pointers
    return pointer(raw_ptr);
  }
};

// Convenience alias: a thrust::cuda::vector of Elem whose allocator is my_allocator<Elem>.
template<typename Elem>
using my_vector =
  thrust::cuda::vector<Elem, my_allocator<Elem>>;

// Demo driver: builds a my_vector, counts and removes every 13, and checks the
// resulting size with assertions.
int main()
{
  // ten elements, all equal to 13 ...
  my_vector<int> vec(10, 13);
  // ... plus one element that is not 13, so exactly one element survives the removal
  // below (without it the final size would be 0 and the last assert would fire)
  vec.push_back(7);

  assert(thrust::count(vec.begin(), vec.end(), 13) == 10);

  // because we're superstitious
  my_policy policy;
  auto new_end = thrust::remove(policy, vec.begin(), vec.end(), 13);
  vec.erase(new_end, vec.end());
  assert(vec.size() == 1);

  return 0;
}


$ nvcc -std=c++11 -I. test.cu -run
hello, world from my special malloc!
hello, world from my special malloc!
hello, world from my special malloc!
hello, world from my special malloc!
