Question

这是我在Stack Overflow上的第一个问题，而且是一个很长的问题。tl;dr版本是：如果我希望thrust::device_vector<BaseClass>同时存储DerivedClass1、DerivedClass2等不同类型的对象，我该如何使用它？

我想在CUDA Thrust中利用多态性。我正在为-arch=sm_30的GPU（GeForce GTX 670）编译。

让我们看一下以下问题：假设镇上有80个家庭，其中60个是已婚夫妇家庭，20个是单亲家庭。因此，各个家庭的成员数量并不相同。现在是人口普查时间，每个家庭必须申报父母的年龄和孩子的数量。因此，政府构建了一个由Family对象组成的数组，即thrust::device_vector<Family> familiesInTown(80)，使得familiesInTown[0]到familiesInTown[59]的信息对应于已婚夫妇，其余的（familiesInTown[60]到familiesInTown[79]）对应于单亲家庭。

Family是基类——家庭中的父母数量（单亲为1，夫妻为2）以及他们的孩子数量作为成员存储在这里。

SingleParent继承自Family，并包含一个新成员——单亲的年龄unsigned int ageOfParent。
MarriedCouple也派生自Family，但会引入两个新成员——父母双方的年龄，unsigned int ageOfParent1和unsigned int ageOfParent2。

```
#include <iostream>
#include <stdio.h>
#include <thrust/device_vector.h>

class Family
{
protected:
  unsigned int numParents;
  unsigned int numChildren;
public:
  __host__ __device__ Family() {};
  __host__ __device__ Family(const unsigned int& nPars, const unsigned int& nChil) : numParents(nPars), numChildren(nChil) {};
  __host__ __device__ virtual ~Family() {};

  __host__ __device__ unsigned int showNumOfParents() {return numParents;}
  __host__ __device__ unsigned int showNumOfChildren() {return numChildren;}
};

class SingleParent : public Family
{
protected:
  unsigned int ageOfParent;
public:
  __host__ __device__ SingleParent() {};
  __host__ __device__ SingleParent(const unsigned int& nChil, const unsigned int& age) : Family(1, nChil), ageOfParent(age) {};

  __host__ __device__ unsigned int showAgeOfParent() {return ageOfParent;}
};

class MarriedCouple : public Family
{
protected:
  unsigned int ageOfParent1;
  unsigned int ageOfParent2;
public:
  __host__ __device__ MarriedCouple() {};
  __host__ __device__ MarriedCouple(const unsigned int& nChil, const unsigned int& age1, const unsigned int& age2) : Family(2, nChil), ageOfParent1(age1), ageOfParent2(age2) {};

  __host__ __device__ unsigned int showAgeOfParent1() {return ageOfParent1;}
  __host__ __device__ unsigned int showAgeOfParent2() {return ageOfParent2;}
};
```

如果我使用以下仿函数天真地初始化thrust::device_vector<Family>中的对象：

```
struct initSlicedCouples : public thrust::unary_function<unsigned int, MarriedCouple>
{
  __device__ MarriedCouple operator()(const unsigned int& idx) const
  // I use a thrust::counting_iterator to get idx
  {
    return MarriedCouple(idx % 3, 20 + idx, 19 + idx);
    // Couple 0: Ages 20 and 19, no children
    // Couple 1: Ages 21 and 20, 1 child
    // Couple 2: Ages 22 and 21, 2 children
    // Couple 3: Ages 23 and 22, no children
    // etc
  }
};

struct initSlicedSingles : public thrust::unary_function<unsigned int, SingleParent>
{
  __device__ SingleParent operator()(const unsigned int& idx) const
  {
    return SingleParent(idx % 3, 25 + idx);
  }
};

int main()
{
  unsigned int Num_couples = 60;
  unsigned int Num_single_parents = 20;

  thrust::device_vector<Family> familiesInTown(Num_couples + Num_single_parents);
  // Families [0] to [59] are couples. Families [60] to [79] are single-parent households.
  thrust::transform(thrust::counting_iterator<unsigned int>(0),
                    thrust::counting_iterator<unsigned int>(Num_couples),
                    familiesInTown.begin(),
                    initSlicedCouples());
  thrust::transform(thrust::counting_iterator<unsigned int>(Num_couples),
                    thrust::counting_iterator<unsigned int>(Num_couples + Num_single_parents),
                    familiesInTown.begin() + Num_couples,
                    initSlicedSingles());
  return 0;
}
```

那么我肯定会犯经典的对象切片（object slicing）错误……

所以，我问自己：用一个指针向量来获得一些美妙的多态性怎么样？C++中有智能指针（smart pointers）这种东西，而thrust迭代器可以做一些非常令人印象深刻的事情，所以我想，不妨试一试。以下代码可以编译。

```
struct initCouples : public thrust::unary_function<unsigned int, MarriedCouple*>
{
  __device__ MarriedCouple* operator()(const unsigned int& idx) const
  {
    return new MarriedCouple(idx % 3, 20 + idx, 19 + idx); // Memory issues?
  }
};

struct initSingles : public thrust::unary_function<unsigned int, SingleParent*>
{
  __device__ SingleParent* operator()(const unsigned int& idx) const
  {
    return new SingleParent(idx % 3, 25 + idx);
  }
};

int main()
{
  unsigned int Num_couples = 60;
  unsigned int Num_single_parents = 20;

  thrust::device_vector<Family*> familiesInTown(Num_couples + Num_single_parents);
  // Families [0] to [59] are couples. Families [60] to [79] are single-parent households.
  thrust::transform(thrust::counting_iterator<unsigned int>(0),
                    thrust::counting_iterator<unsigned int>(Num_couples),
                    familiesInTown.begin(),
                    initCouples());
  thrust::transform(thrust::counting_iterator<unsigned int>(Num_couples),
                    thrust::counting_iterator<unsigned int>(Num_couples + Num_single_parents),
                    familiesInTown.begin() + Num_couples,
                    initSingles());

  Family A = *(familiesInTown[2]); // Compiles, but object slicing takes place (in theory)
  std::cout << A.showNumOfParents() << "\n"; // Segmentation fault
  return 0;
}
```

看起来我在这里碰壁了。我对内存管理（虚函数表等）的理解正确吗？我的对象是否真的在设备上被实例化并填充了？我是不是在毫无节制地泄漏内存？

无论如何，为了避免对象切片，我尝试了dynamic_cast<DerivedPointer*>(basePointer)。这就是我把Family的析构函数设为virtual的原因。

以下几行可以编译，但不幸的是，运行时再次出现了段错误。CUDA-Memcheck也没有告诉我原因。

Family *pA = familiesInTown[2];
MarriedCouple *pB = dynamic_cast<MarriedCouple*>(pA);

和

  std::cout << "Ages " << (pB -> showAgeOfParent1()) << ", " << (pB -> showAgeOfParent2()) << "\n";

```
MarriedCouple B = *pB;
std::cout << "Ages " << B.showAgeOfParent1() << ", " << B.showAgeOfParent2() << "\n";
```

简而言之，我需要的是一个类接口，用于表示具有不同属性、成员数量彼此不同的对象，但我又可以把它们存储在一个公共向量中（这就是为什么我想要一个基类），并且可以在GPU上对其进行操作。我的目的是在thrust变换（transformations）和CUDA内核中都能使用它们；在我需要把类拆分成一个基类和若干派生类之前，这种做法对我来说一直运行得完美无缺。这种情况下的标准做法是什么？

提前致谢！

Answer 1

我不打算回答这个问题中的所有内容，它太大了。虽然这里有一些关于你发布的代码的观察结果可能有所帮助：

GPU端的new运算符从私有的运行时堆中分配内存。截至CUDA 6，主机端的CUDA API无法访问该内存。您可以在内核和设备函数中访问这块内存，但主机无法访问它。因此，在Thrust设备仿函数中使用new是一种有缺陷的设计，永远不会起作用。这就是你的“指针向量”模型失败的原因。
Thrust从根本上说旨在让典型STL算法的数据并行版本可以应用于POD类型。使用复杂的多态对象构建代码库，并试图把它们塞进Thrust的容器和算法中，也许能够工作，但这不是Thrust的设计目标，我不推荐这样做。如果你以意想不到的方式把Thrust弄坏了，不要感到惊讶。
CUDA支持许多C++功能，但其编译模型和对象模型比它们所基于的C++98标准简单得多。CUDA缺少几个关键特性（例如RTTI），而正是这些特性使得复杂的多态对象设计在C++中可行。我的建议是谨慎使用C++功能。仅仅因为你可以在CUDA中做某事，并不意味着你应该这样做。GPU是一种简单的架构，简单的数据结构和代码几乎总是比功能相似的复杂对象更高效。

浏览了你发布的代码之后，我的总体建议是推倒重来，重新设计。如果你想看一些非常优雅的CUDA/C++设计，可以花点时间阅读CUB和CUSP的代码库。两者差别很大，但从两者中都可以学到很多东西（而且CUSP建立在Thrust之上，我猜这使得它与你的使用场景更加相关）。

Answer 2

我完全同意@talonmies的回答。（例如，据我所知，Thrust并没有针对多态做过广泛的测试。）此外，我还没有完全读懂你的代码。我发布这个答案是为了补充一些信息，特别是我相信某种程度的多态是可以与Thrust配合使用的。

我要指出的一个关键点是：不允许把带有虚函数的类的对象作为参数传递给__global__函数（it is not allowed to pass as an argument to a __global__ function an object of a class with virtual functions）。这意味着在主机上创建的多态对象无法传递给设备（无论是通过Thrust还是普通的CUDA C++）。（此限制的一个原因是对象中需要有虚函数表，而虚函数表在主机和设备之间必然不同；再加上在主机代码中直接获取设备函数的地址是非法的。）

但是，多态（polymorphism）可以在设备代码中工作，包括Thrust的设备函数。

以下示例演示了这个想法，并且只使用在设备上创建的对象，当然我们也可以用主机数据来初始化它们。我创建了两个类Triangle和Rectangle，它们派生自包含虚函数area的基类Polygon。Triangle和Rectangle从基类继承函数set_values，但各自重写（override）了虚函数area。

然后我们可以多态地操纵这些类的对象，如下所示：

#include <iostream> #include <thrust/device_vector.h> #include <thrust/for_each.h> #include <thrust/sequence.h> #include <thrust/iterator/zip_iterator.h> #include <thrust/copy.h> #define N 4 class Polygon { protected: int width, height; public: __host__ __device__ void set_values (int a, int b) { width=a; height=b; } __host__ __device__ virtual int area () { return 0; } }; class Rectangle: public Polygon { public: __host__ __device__ int area () { return width * height; } }; class Triangle: public Polygon { public: __host__ __device__ int area () { return (width * height / 2); } }; struct init_f { template <typename Tuple> __host__ __device__ void operator()(const Tuple &arg) { (thrust::get<0>(arg)).set_values(thrust::get<1>(arg), thrust::get<2>(arg));} }; struct setup_f { template <typename Tuple> __host__ __device__ void operator()(const Tuple &arg) { if (thrust::get<0>(arg) == 0) thrust::get<1>(arg) = &(thrust::get<2>(arg)); else thrust::get<1>(arg) = &(thrust::get<3>(arg));} }; struct area_f { template <typename Tuple> __host__ __device__ void operator()(const Tuple &arg) { thrust::get<1>(arg) = (thrust::get<0>(arg))->area();} }; int main () { thrust::device_vector<int> widths(N); thrust::device_vector<int> heights(N); thrust::sequence( widths.begin(), widths.end(), 2); thrust::sequence(heights.begin(), heights.end(), 3); thrust::device_vector<Rectangle> rects(N); thrust::device_vector<Triangle> trgls(N); thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(rects.begin(), widths.begin(), heights.begin())), thrust::make_zip_iterator(thrust::make_tuple(rects.end(), widths.end(), heights.end())), init_f()); thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(trgls.begin(), widths.begin(), heights.begin())), thrust::make_zip_iterator(thrust::make_tuple(trgls.end(), widths.end(), heights.end())), init_f()); thrust::device_vector<Polygon *> polys(N); thrust::device_vector<int> selector(N); for (int i = 0; i<N; i++) selector[i] = i%2; 
thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(selector.begin(), polys.begin(), rects.begin(), trgls.begin())), thrust::make_zip_iterator(thrust::make_tuple(selector.end(), polys.end(), rects.end(), trgls.end())), setup_f()); thrust::device_vector<int> areas(N); thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(polys.begin(), areas.begin())), thrust::make_zip_iterator(thrust::make_tuple(polys.end(), areas.end())), area_f()); thrust::copy(areas.begin(), areas.end(), std::ostream_iterator<int>(std::cout, "\n")); return 0; }

我建议针对计算能力（cc）2.0或更新的架构编译上面的代码。我在RHEL 5.5上使用CUDA 6进行了测试。

（多态示例的想法和一些代码来自here。）

CUDA / CUDA推力中的多态性和派生类

2 个答案: