Question

我测试了下面这段代码，并试图解释其行为的原因、找出解决问题的方法，但没有成功：

#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/random.h>

#include <thrust/execution_policy.h>

#include <cstdio>
#include <iostream>
#include <cmath>
#include <vector>
#include <boost/concept_check.hpp>

// Unary functor for thrust::transform over a zipped (x, y) range.
//
// Maps each input tuple (Xx, Xy) to (a*Xx - b*Xy, -b*Xx + a*Xy), where a and b
// are the two coefficients captured when the functor is constructed. The same
// pair (a, b) is used for every element the functor is applied to.
struct alter_tuple {

    // Marked __host__ __device__ so the functor can be constructed inside
    // device code (it is built inside another functor's operator()).
    __host__ __device__
    alter_tuple(const int& a_, const int& b_) : a(a_), b(b_) {}

    // X: input tuple (Xx, Xy).
    // Returns the transformed tuple (Xpx, Xpy); also prints the mapping for
    // debugging. Declared const because it never modifies the functor.
    __host__ __device__
    thrust::tuple<int,int> operator()(thrust::tuple<int,int> X) const
    {
        const int Xx = thrust::get<0>(X);
        const int Xy = thrust::get<1>(X);
        const int Xpx =  a*Xx - b*Xy;
        const int Xpy = -b*Xx + a*Xy;

        printf("in (%d,%d) -> (%d,%d)\n",Xx,Xy,Xpx,Xpy);

        return thrust::make_tuple(Xpx,Xpy);
    }

    // Plain by-value members: every alter_tuple object (and every copy made of
    // it) carries its own a and b, fixed at construction. They are not a
    // shared memory location, so the functor itself cannot race on them.
    int a;
    int b;
};


// Index-based functor for thrust::transform over a counting range.
//
// For an index idx it reads a[idx] and b[idx] and returns the tuple
// (a[idx]*a[idx] - b[idx]*b[idx], -b[idx]*a[idx] + a[idx]*b[idx]).
// Unlike alter_tuple, the coefficients therefore vary with idx.
struct alter_tuple_arr {

    // Marked __host__ __device__ so the functor can be constructed inside
    // device code (it is built inside another functor's operator()).
    // All four pointers are stored; only a and b are read by operator().
    __host__ __device__
    alter_tuple_arr(int* a_, int* b_, int* c_, int* d_) : a(a_), b(b_), c(c_), d(d_) {}

    // idx: element index into the arrays a and b.
    // Returns the transformed tuple (Xpx, Xpy); also prints the mapping for
    // debugging. Declared const because it never modifies the functor.
    __host__ __device__
    thrust::tuple<int,int> operator()(const int& idx) const
    {
        // Load each input once; the same value serves as coefficient and operand.
        const int Xx = a[idx];
        const int Xy = b[idx];
        const int Xpx =  Xx*Xx - Xy*Xy;
        // NOTE(review): with coefficient and operand both taken from
        // a[idx]/b[idx], this is -b*a + a*b -- confirm that is intended.
        const int Xpy = -Xy*Xx + Xx*Xy;

        printf("in (%d,%d) -> (%d,%d)\n",Xx,Xy,Xpx,Xpy);

        return thrust::make_tuple(Xpx,Xpy);
    }

    int* a;   // read by operator()
    int* b;   // read by operator()
    int* c;   // stored but not accessed by this functor
    int* d;   // stored but not accessed by this functor
};


// Outer functor: for each index idx it launches a *nested* thrust::transform
// over the complete [0, N_) range of the four arrays, prints every entry, and
// returns element idx of the c array.
//
// Note that each single invocation writes all N_ elements of cv_ and dv_, so
// invocations for different idx values write to the same output locations.
// If those invocations run concurrently, the final contents depend on which
// write lands last.
struct bFuntor 
{
    // av__, bv__, cv__, dv__: raw pointers to int arrays holding (at least) N__
    // elements each; operator() wraps them with thrust::device_pointer_cast.
    // N__: number of elements processed by the nested transform.
    // The member af is not initialized here.
    bFuntor(int* av__, int* bv__, int* cv__, int* dv__, const int& N__) : av_(av__), bv_(bv__), cv_(cv__), dv_(dv__), N_(N__) {};

    // idx: index supplied by the outer transform.
    // Returns cv_[idx] as read after the nested transform and the printout.
    __host__ __device__
    int operator()(const int& idx)
    {
    // Wrap the raw pointers as device_ptr: one for the begin and one for the
    // end (begin + N_) of each array.
    thrust::device_ptr<int> av_dpt = thrust::device_pointer_cast(av_);
    thrust::device_ptr<int> av_dpt1 = thrust::device_pointer_cast(av_+N_);
    thrust::device_ptr<int> bv_dpt = thrust::device_pointer_cast(bv_);
    thrust::device_ptr<int> bv_dpt1 = thrust::device_pointer_cast(bv_+N_);
    thrust::device_ptr<int> cv_dpt = thrust::device_pointer_cast(cv_);
    thrust::device_ptr<int> cv_dpt1 = thrust::device_pointer_cast(cv_+N_);
    thrust::device_ptr<int> dv_dpt = thrust::device_pointer_cast(dv_);
    thrust::device_ptr<int> dv_dpt1 = thrust::device_pointer_cast(dv_+N_);

    // Turn each device_ptr into a normal_iterator; x0/x1 are the begin/end
    // iterators of array x. c1 and d1 are built but not used below.
    thrust::detail::normal_iterator<thrust::device_ptr<int>> a0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(av_dpt);
    thrust::detail::normal_iterator<thrust::device_ptr<int>> a1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(av_dpt1);
    thrust::detail::normal_iterator<thrust::device_ptr<int>> b0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(bv_dpt);
    thrust::detail::normal_iterator<thrust::device_ptr<int>> b1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(bv_dpt1);
    thrust::detail::normal_iterator<thrust::device_ptr<int>> c0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(cv_dpt);
    thrust::detail::normal_iterator<thrust::device_ptr<int>> c1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(cv_dpt1);
    thrust::detail::normal_iterator<thrust::device_ptr<int>> d0 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(dv_dpt);
    thrust::detail::normal_iterator<thrust::device_ptr<int>> d1 = thrust::detail::make_normal_iterator<thrust::device_ptr<int>>(dv_dpt1);

    // ** alter_tuple is WRONG
    // Variant selected by the #define below: transforms every (a[i], b[i])
    // pair into (c[i], d[i]) using ONE coefficient pair, cv_[idx] and
    // dv_[idx], for the whole range. Those coefficients are read from the
    // same arrays this transform writes to.
#define WRONG
#ifdef WRONG
    thrust::transform(thrust::device,
              thrust::make_zip_iterator(thrust::make_tuple(a0,b0)),
              thrust::make_zip_iterator(thrust::make_tuple(a1,b1)),
//            thrust::make_zip_iterator(thrust::make_tuple(cv_dpt,dv_dpt)),   // cv_dpt  
              thrust::make_zip_iterator(thrust::make_tuple(c0,d0)),   // cv_dpt  
              alter_tuple(cv_[idx],dv_[idx]));    
#endif

#ifdef RIGHT
    // ** alter_tuple_arr is CORRECT way to do it
    // Alternative variant (compiled only if RIGHT is defined): iterates over
    // the indices 0..N_-1 and lets alter_tuple_arr read a[i] and b[i] itself,
    // so the values used differ for every element.
    thrust::transform(thrust::device,
              thrust::counting_iterator<int>(0),
              thrust::counting_iterator<int>(N_),
//            thrust::make_zip_iterator(thrust::make_tuple(cv_dpt,dv_dpt)),   // cv_dpt  
              thrust::make_zip_iterator(thrust::make_tuple(c0,d0)),   // cv_dpt  
              alter_tuple_arr(av_,bv_,cv_,dv_));    
#endif

    // Debug dump of all four arrays as this invocation sees them after the
    // nested transform.
    for (int i=0; i<N_; i++)
      printf("out: (%d,%d) -> (%d,%d)\n",av_[i],bv_[i],cv_[i],dv_[i]);

        // The outer transform stores this return value into its own output.
        return cv_dpt[idx];
    }

    // By-value members: each copy of the functor object carries its own
    // pointers and N_. The arrays they point to are not copied.
    int* av_;
    int* bv_;
    int* cv_;
    int* dv_;
    int N_;
    float af;   // never initialized or read anywhere in this struct
};


// Integer mixing function: scrambles the bits of `a` through six rounds of
// additions / xors with shifted copies of the running value and fixed
// constants. Callable from host and device code; arithmetic is unsigned and
// therefore wraps on overflow.
__host__ __device__
unsigned int hash(unsigned int a)
{
    unsigned int x = a;

    x += 0x7ed55d16 + (x << 12);
    x ^= 0xc761c23c ^ (x >> 19);
    x += 0x165667b1 + (x << 5);

    // This round mixes an addition with an xor, so it is spelled out.
    const unsigned int bumped = x + 0xd3a2646c;
    x = bumped ^ (x << 9);

    x += 0xfd7046c5 + (x << 3);
    x ^= 0xb55a4f09 ^ (x >> 16);

    return x;
}


// Driver: fills four host vectors with pseudo-random integers, uploads them to
// the device, runs bFuntor once per index through an outer thrust::transform
// (which stores each return value into cv_d), and copies the vectors written
// on the device back to the host.
int main(void)
{
    const int N = 10;

    std::vector<int> av, bv, cv, dv;
    av.reserve(N);
    bv.reserve(N);
    cv.reserve(N);
    dv.reserve(N);

    // Fixed seed, so every run produces the same input values.
    unsigned int seed = hash(10);
    thrust::default_random_engine rng(seed);
    thrust::uniform_real_distribution<float> u01(0,10);

    // Draw order (a, b, c, d per iteration) determines which value lands in
    // which vector, so it is kept exactly as is.
    for (int i = 0; i < N; i++) {
        av.push_back((int)u01(rng));
        bv.push_back((int)u01(rng));
        cv.push_back((int)u01(rng));
        dv.push_back((int)u01(rng));
    }

    // Build the device vectors directly from the host data.
    thrust::device_vector<int> av_d(av.begin(), av.end());
    thrust::device_vector<int> bv_d(bv.begin(), bv.end());
    thrust::device_vector<int> cv_d(cv.begin(), cv.end());
    thrust::device_vector<int> dv_d(dv.begin(), dv.end());

    // Outer transform: one bFuntor invocation per index 0..N-1. The functor
    // only receives raw pointers; the device vectors keep ownership.
    thrust::transform(thrust::counting_iterator<int>(0),
                      thrust::counting_iterator<int>(N),
                      cv_d.begin(),
                      bFuntor(thrust::raw_pointer_cast(av_d.data()),
                              thrust::raw_pointer_cast(bv_d.data()),
                              thrust::raw_pointer_cast(cv_d.data()),
                              thrust::raw_pointer_cast(dv_d.data()),
                              N));

    // Copy back the vectors that the transforms actually write (c and d).
    // bv_d is a pure input and is never modified on the device.
    thrust::host_vector<int> cv_h(N);
    thrust::host_vector<int> dv_h(N);
    thrust::copy(cv_d.begin(), cv_d.end(), cv_h.begin());
    thrust::copy(dv_d.begin(), dv_d.end(), dv_h.begin());

    return 0;
}

在这个嵌套的 Thrust 调用中，我测试了两个嵌套的仿函数，其中一个能正常工作（即使用 `#define RIGHT` 的那个）。对于 WRONG 的仿函数，即 alter_tuple：

两个变量int a，int b在哪里？主机还是设备？或本地内核寄存器？或者它们在这个仿函数的运算符的线程之间共享？
在内部，alter_tuple仿函数，我试图打印出结果（int printf（“in ...”）），这是正确的计算。但是，当此结果返回给调用者仿函数并打印出来时（在printf（“out ....”）中），它们是不正确的，与之前的计算不同

为什么这些结果会有所不同？我似乎无法解释它，并且没有文件或示例来引用

此差异可以在输出中看到（原文此处链接到输出：here）。

修改1 ：

最小规模的测试代码（SO_example_no_tuple_arr_wo_c.cu）显示，仿函数（字面意思是 a * x = y）在两种情况下都正确接收/初始化了值。

打印出来是：

out: 9*8 -> 72
out: 9*8 -> 72
out: 9*8 -> 72
out: 6*4 -> 24
out: 6*4 -> 24
out: 6*4 -> 24
out: 1*8 -> 8
out: 1*8 -> 8
out: 1*6 -> 6
out: 9*1 -> 9
out: 9*1 -> 9

显示正确的接收值

不使用指针/数组传递输入值的最小测试代码（SO_example_no_tuple.cu）表明，即使输入值已被正确初始化，返回的结果仍然是错误的。

在N = 2的情况下输出：

in 9*8 -> 72
in 6*4 -> 24
in 9*8 -> 72
in 6*4 -> 24
out: 9*8 -> 24
out: 9*8 -> 24
out: 6*4 -> 24
out: 6*4 -> 24

Answer 1

值的差异并非严格地归因于数据竞争问题。

您的两种方法做的并不是同一件事，这与嵌套 thrust::transform 调用的每次调用中所选用的 a 和 b 的值有关。如果您设置 N = 1，这一点就很明显，因为这应该可以消除对数据竞争的任何担忧，而结果仍然不同。

在“失败”的情况下，您正在调用alter_tuple()运算符，如下所示：

thrust::transform(thrust::device,
          ...
          alter_tuple(cv_[idx],dv_[idx]));

于是，这些值（cv_[idx]，dv_[idx]）将成为初始化参数，并最终成为仿函数内的变量 a 和 b。但是，您的“通过”的情况实际上是用 a[idx] 和 b[idx]（对应于 av_[idx] 和 bv_[idx]）来充当这些变量。如果我们将 alter_tuple 调用更改为使用 a 和 b：

          alter_tuple(av_[idx],bv_[idx]));

然后N = 1个案例结果现在匹配。这更容易理解，因为我们实际上只有a，b，c，d向量中的一个条目。

但是，当我们扩展到N = 10的情况时，我们不再获得匹配的结果。为了解释原因，我们需要了解在这种情况下在函子内使用a和b。在“失败”的情况下，我们正在为函数中使用的a和b传递单个初始化值：

          alter_tuple(av_[idx],bv_[idx]));

因此，对于给定的线程，也就是对嵌套 thrust::transform 调用的给定一次调用，a 和 b 将各自只使用单个值：

alter_tuple(const int& a_, const int& b_) : a(a_), b(b_){};
...
int a;   // these values are constant across variation of "idx"
int b;   // passed to the functor

另一方面，在“通过”的情况下，在嵌套变换调用中，a 和 b 的值对于传递给仿函数的每个元素都会有所不同：

thrust::tuple<int,int> operator()(const int& idx)
{
int Xx = a[idx];  // these values of a and b *vary* for each idx
int Xy = b[idx];  // passed to the functor

一旦理解了这一点，如果“通过”的情况才是所期望的情况，那么我不知道有什么办法可以把第一种情况转换成能产生相同结果的形式，因为您无法让单个初始化值表现出“通过”情况中 a 和 b 那样随元素变化的行为。

以上都不涉及数据竞争，但由于您的每一次操作（即每个线程）都在写入 c 和 d 的每一个值，我认为这种整体方法没有任何意义，而且我不确定您想要完成什么。我想，如果您将它扩展到更多的元素/线程，那么肯定会遇到不可预测/可变的结果。

要回答其他一些问题：变量 a 和 b 最终会作为线程局部变量存在于设备上。因此，任一仿函数中的每个数据成员都是设备上的线程局部变量。

在内部，alter_tuple仿函数，我试图打印出结果（int printf（“in ...”）），这是正确的计算。但是，当此结果返回给调用者仿函数并打印出来时（在printf（“out ....”）中），它们是不正确的，并且与之前的计算不同

您的每个线程都写入 c 和 d 向量中的相同位置。因此，由于每个线程都写入整个向量，但（在失败的情况下）每个线程在仿函数内部使用的 a 和 b 的初始化值各不相同，按理说每个线程将为 c 和 d 向量计算出不同的结果，而 Thrust 调用完成后得到的结果将取决于哪个线程“赢得”了输出写操作。这是不可预测的，当然也不是所有线程的打印输出都会与最终结果一致，因为每个线程为 c 和 d 计算出的值各不相同。

它是嵌套推力函数中的数据竞争吗？

1 个答案: