如何使用thrust :: lower_bound()

时间:2015-07-06 17:50:34

标签: c++ cuda thrust

我使用GPU上的Marching cube(使用CUDA)生成网格。网格非常详细,粗略的顶点列表存储在GPU中,映射到CUDA数组float *d_vertexData的VBO中。数据顺序是顶点位置和正常交错,如下所示。

{v0x, v0y, v0z, n0x, n0y, n0z, v1x, v1y, v1z, n1x, n1y, n1z, ...}

网格的大小通常为34MB(500K三角形)~1400MB(20M三角形),并存储在GPU上。

然后我使用thrust::sort()thrust::unique删除重复的顶点并使用thrust::lower_bound()来计算索引。在该步骤之后,网眼尺寸减小70%或更多。下面的代码演示了这一步骤。

float exampleVerts[36]=
{ 1, 2, 3, 0, 1, 0, 4, 5, 6, 0, 1, 0, 7, 8, 9, 0, 1, 0, 1, 2, 3, 0, 1, 0,
4, 5, 6, 0, 1, 0, 10, 11, 12, 0, 1, 0};

unsingned int numVertices = 36;
cudaMalloc(void**(&d_vertexData), numVertices*sizeof(float));
cudaMemCpy( d_vertexData, exampleVerts, numVertices*sizeof(float), cudaMemcpyHostToDevice);

unsigned int data_size = numVertices * 6; //6 floats per vertex

thrust::device_ptr<float> vertsPtr = thrust::device_pointer_cast(d_vertexData);

thrust::device_vector<float> vertsCopy(vertsPtr, vertsPtr + data_size);
thrust::device_vector<unsigned int> indices(numVertices);

auto zip_vert_first = zip(...); // using vertsPtr and strided_range
auto zip_vert_last = zip(...); // using vertsPtr and strided_range

thrust::sort(zip_verts_first, zip_verts_last);
auto new_vert_last = thrust::unique(zip_vertex_first, zip_vertex_last);

auto zip_vertcopy_first = zip(...); //using vertsCopy.data() and strided_range
auto zip_vertcopy_last = zip(...); //using vertsCopy.data() and strided_range

//find index of each input vertex in the list of unique vertices
thrust::lower_bound(zip_vert_first, new_vert_last,
    zip_vertcopy_first, zip_vertcopy_last,
    indices.begin());

它有效,但内存需求量很大。此行thrust::device_vector<float> vertsCopy(vertsPtr, vertsPtr + data_size);需要[VBO size]内存来存储顶点副本,以便在thrust::lower_bound()中使用。

在我的应用程序中,对于粗略顶点列表,网格通常非常大,高达1.5GB。该方法具有以下限制。

  

它需要额外117%的VBO大小。   (所有顶点的副本为100%,索引为17%)

由于此限制,此方法无法在具有2GB或更低VRAM的GPU上运行。我正在使用带有4GB VRAM的GPU,即使这样我也很容易在我的应用程序中达到这个限制。

如果没有这么大的内存要求,还有其他方法可以在GPU上计算索引吗?否则我唯一的选择是回到CPU(主机),我认为这将非常慢。

1 个答案:

答案 0 :(得分:2)

如果对索引而不是顶点数据本身进行操作,则可以避免顶点的复制。

以下示例(基于我的answer to your previous question和我的回答here)执行以下步骤:

  1. 一步对顶点和索引进行排序
  2. 查找重复顶点的起始索引
  3. 根据这些起始索引删除重复的顶点
  4. 计算新指数
  5. 最终索引存储在d_indices_2中。

    <强>输出

    d_vertices:     1   2   3   4   5   6   7   8   9   4   5   6   7   8   9   0   1   2   
    d_indices:      0   1   2   3   4   5   
    d_vertices:     0   1   2   1   2   3   4   5   6   4   5   6   7   8   9   7   8   9   
    d_indices:      5   0   1   3   2   4   
    d_indices_2:    0   1   2   0   4   0   
    d_vertices:     0   1   2   1   2   3   4   5   6   7   8   9   
    d_indices_3:    0   1   2   2   3   3   
    d_indices_2:    1   2   3   2   3   0   
    
    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/copy.h>
    #include <thrust/sequence.h>
    #include <thrust/transform.h>
    #include <thrust/functional.h>
    #include <thrust/scan.h>
    #include <iostream>
    #include <thrust/tuple.h>
    #include <thrust/execution_policy.h>
    #include <thrust/scatter.h>
    #include <thrust/unique.h>
    #include <thrust/remove.h>
    #include <stdint.h>
    
    template<typename... Iterators>
    __host__ __device__
    thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
    {
        return thrust::make_zip_iterator(thrust::make_tuple(its...));
    }
    
    template <typename Iterator, typename thrust::iterator_difference<Iterator>::type stride>
    class strided_range
    {
    public:
        typedef typename thrust::iterator_difference<Iterator>::type difference_type;
    
        //template <difference_type stride>
        struct stride_functor : public thrust::unary_function<difference_type,difference_type>
        {
            __host__ __device__
            difference_type operator()(const difference_type& i) const
            { 
                return stride * i;
            }
        };
    
        typedef typename thrust::counting_iterator<difference_type>                           CountingIterator;
        typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
        typedef typename thrust::permutation_iterator<Iterator,TransformIterator>             PermutationIterator;
    
        // type of the strided_range iterator
        typedef PermutationIterator iterator;
    
        // construct strided_range for the range [first,last)
        strided_range(Iterator first, Iterator last)
            : first(first), last(last) {}
    
        iterator begin(void) const
        {
            return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor()));
        }
    
        iterator end(void) const
        {
            return begin() + ((last - first) + (stride - 1)) / stride;
        }
    
    protected:
        Iterator first;
        Iterator last;
    };
    
    
    template<typename, typename>
    struct append_to_type_seq { };
    
    template<typename T, typename... Ts, template<typename...> class TT>
    struct append_to_type_seq<T, TT<Ts...>>
    {
        using type = TT<Ts..., T>;
    };
    
    template<typename T, unsigned int N, template<typename...> class TT>
    struct repeat
    {
        using type = typename
            append_to_type_seq<
                T,
                typename repeat<T, N-1, TT>::type
                >::type;
    };
    
    template<typename T, template<typename...> class TT>
    struct repeat<T, 0, TT>
    {
        using type = TT<>;
    };
    
    template<typename Tuple> struct std_to_thrust_tuple;
    template<typename...T> struct std_to_thrust_tuple<std::tuple<T...>> {
      using type = thrust::tuple<T...>;
    };
    
    template<typename IteratorType, std::size_t stride>
    class zipped_strided_range
    {
    public:
    
        typedef typename strided_range<IteratorType, stride>::iterator SingleIterator;
        typedef typename repeat<SingleIterator, stride, std::tuple>::type StdIteratorTuple;
        typedef typename std_to_thrust_tuple<StdIteratorTuple>::type IteratorTuple;
        typedef decltype(thrust::make_zip_iterator(IteratorTuple())) ZipIterator;
    
        zipped_strided_range(IteratorType first, IteratorType last) : first(first), last(last)
        {
            assign<0>();
        }
    
        ZipIterator begin() const
        {
            return thrust::make_zip_iterator(begin_tuple);
        }
    
        ZipIterator end() const
        {
            return thrust::make_zip_iterator(end_tuple);
        }
    
    protected:
    
        template <std::size_t index>
        void assign(typename std::enable_if< (index < stride) >::type* = 0)
        {
            strided_range<IteratorType,stride> strided_range_iterator(first+index, last-(stride-1)+index);
    
            thrust::get<index>(begin_tuple) = strided_range_iterator.begin();
            thrust::get<index>(end_tuple) = strided_range_iterator.end();
            assign<index+1>();
        }
    
        template <std::size_t index>
        void assign(typename std::enable_if< (index == stride) >::type* = 0)
        {
            // end recursion
        }
    
        IteratorType first;
        IteratorType last;
    
        IteratorTuple begin_tuple;
        IteratorTuple end_tuple;
    };
    
    
    
    
    #define PRINTER(name) print(#name, (name))
    template <template <typename...> class V, typename T, typename ...Args>
    void print(const char* name, const V<T,Args...> & v)
    {
        std::cout << name << ":\t";
        thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
        std::cout << std::endl;
    }
    
    
    template <typename IteratorType, typename IndexType = uint32_t>
    struct my_scatter : public thrust::unary_function<IndexType,IndexType>
    {
        my_scatter(IteratorType first) : first(first)
        {
        }
    
       __host__ __device__
       IndexType operator()(const IndexType& i)
       {
          IndexType result = i;
          if (i > static_cast<IndexType>(0) && *(first+i) == *(first+i-static_cast<IndexType>(1)))
          { 
              result = static_cast<IndexType>(0);
          }
          return result;
       }
    
       IteratorType first;
    };
    
    template <typename IteratorType>
    my_scatter<IteratorType> make_my_scatter(IteratorType first)
    {
      return my_scatter<IteratorType>(first);
    }
    
    template <typename T>
    struct my_transformer : public thrust::unary_function<T,T>
    {
      __host__ __device__
      T operator()(const T& x) const 
      {
        return static_cast<bool>(x);
      }
    };
    
    
    int main()
    {
        using namespace thrust::placeholders;
    
        const int stride = 3;
        const int num = 6;
    
        const int size = stride * num;
    
        float values[size] = {1,2,3,
                              4,5,6,
                              7,8,9,
                              4,5,6,
                              7,8,9,
                              0,1,2
        };
    
    
        typedef uint32_t Integer;
    
        thrust::host_vector<float> h_vertices (values, values+size);
        thrust::device_vector<float> d_vertices = h_vertices;
        float* dev_ptr = thrust::raw_pointer_cast(d_vertices.data());
        zipped_strided_range<float*, stride> zipped(dev_ptr, dev_ptr+size);
    
        thrust::device_vector<Integer> d_indices(num);
        thrust::sequence(d_indices.begin(), d_indices.end());
    
        PRINTER(d_vertices);
        PRINTER(d_indices);
    
        // 1. sort 
        auto zip_begin = zip(zipped.begin(),d_indices.begin());
        auto zip_end   = zip(zipped.end(),d_indices.end());
        thrust::sort(thrust::device, zip_begin, zip_end);
        PRINTER(d_vertices);
        PRINTER(d_indices);
    
        thrust::device_vector<Integer> d_indices_2(num);
    
        // 2. find start indics of duplicate vertices
        auto my_scatter_op =  make_my_scatter(zipped.begin());
        thrust::transform(thrust::make_counting_iterator(static_cast<Integer>(0)),
                          thrust::make_counting_iterator(static_cast<Integer>(num)),
                          d_indices_2.begin(),
                          my_scatter_op);
        PRINTER(d_indices_2);
    
        // 3. remove duplicate vertices
        /*
        // unique could be used, but we already know which vertices we want
        auto new_end = thrust::unique(thrust::device, zipped.begin(), zipped.end());
        */
        auto new_end = thrust::remove_if(thrust::device, zipped.begin()+1, zipped.end()+1, d_indices_2.begin()+1, !_1);
        int new_size = (new_end - zipped.begin());
        d_vertices.resize(stride*new_size);
        PRINTER(d_vertices);
    
        thrust::device_vector<Integer> d_indices_3(num);
        auto transform_op = my_transformer<Integer>();
        auto t_b = thrust::make_transform_iterator(d_indices_2.begin()+1, transform_op);
        auto t_e = thrust::make_transform_iterator(d_indices_2.end(), transform_op);
        thrust::inclusive_scan(t_b, t_e, d_indices_3.begin()+1);
        PRINTER(d_indices_3);
    
        // 4. calculate final indices
        thrust::scatter(d_indices_3.begin(), d_indices_3.end(), d_indices.begin(), d_indices_2.begin());
        PRINTER(d_indices_2);
    
    
      return 0;
    }