Question

我有以下输入数据：

e = 0 0 0 0 0 0 | 1 1 1
t = 1 1 4 4 4 5 | 1 6 7
i = 0 1 2 3 4 5 | 6 7 8 // indices from [0,n-1]

数据首先按 e 排序，然后按 t 排序。e 是用于标识数据中各个段（segment）的键。在本例中：

segment_0 = [0,5]
segment_1 = [6,8]

每个段又按 t 划分为若干子段（sub_segment）。在本例中：

sub_segment_0_0 = [0,1] // t==1
sub_segment_0_1 = [2,4] // t==4
sub_segment_0_2 = [5,5] // t==5

sub_segment_1_0 = [6,6] // t==1
sub_segment_1_1 = [7,7] // t==6
sub_segment_1_2 = [8,8] // t==7

我想创建以下输出序列：

f = 2 2 5 5 5 6 | 7 8 9
l = 6 6 6 6 6 6 | 9 9 9

f 包含当前段（segment）中下一个 sub_segment 的起始索引。

l包含（当前段中最后一个sub_segment的结束索引）+ 1.

对于每个段的最后一个 sub_segment，f 和 l 两个值相同，都指向其结束索引 + 1（例如上面索引 5 处 f = l = 6）。

为了计算 f，我尝试使用 thrust::upper_bound，但这只在数据中只有一个段（segment）时才有效：

#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>  
#include <stdint.h>
#include <iostream>

// PRINTER(x) prints the vector x, labelled with its own identifier.
#define PRINTER(name) print(#name, (name))

// Writes `name` followed by ":" and a tab, then every element of `v` followed
// by a tab, then ends the line (std::endl also flushes std::cout).
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::ostream_iterator<T> tab_separated(std::cout, "\t");
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), tab_separated);
    std::cout << std::endl;
}

int main()
{
    uint32_t e[] = {0,0,0,0,0,0};
    uint32_t t[] = {1,1,4,4,4,5};
    uint32_t i[] = {0,1,2,3,4,5};

    int size = sizeof(i)/sizeof(i[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_i(i,i+size);
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i = h_i;
    DVec d_e = h_e;
    DVec d_t = h_t;
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

    DVec upper(size);
    thrust::upper_bound(d_t.begin(), d_t.end(), d_t.begin(), d_t.end(), upper.begin());
    PRINTER(upper);

    return 0;
}

输出：

d_e:    0   0   0   0   0   0   
d_t:    1   1   4   4   4   5   
d_i:    0   1   2   3   4   5   
upper:  2   2   5   5   5   6

如果输入数据包含两个段（segment），它就不再有效了，因为 Thrust 中没有 thrust::upper_bound_by_key：

// replace in the code above
uint32_t e[] = {0,0,0,0,0,0,1,1,1};
uint32_t t[] = {1,1,4,4,4,5,1,6,7};
uint32_t i[] = {0,1,2,3,4,5,6,7,8};

输出

d_e:    0   0   0   0   0   0   1   1   1   
d_t:    1   1   4   4   4   5   1   6   7   
d_i:    0   1   2   3   4   5   6   7   8   
upper:  2   2   7   7   7   7   2   8   9

如何为我的数据实施upper_bound_by_key？

我怎样才能有效地计算l？

我愿意接受任何解决方案，并不一定要使用 Thrust。

Answer 1

这是一种可能的方法：

1. 标记每个（t-）段的结尾。我假设一个 e 段可能只包含一个 t 段；在这种情况下，相邻的 e 段可能具有数值相同的 t 段（比如都是 1）。因此，标记段的结尾时需要同时考虑 e 和 t。我使用了一种与相邻差分（adjacent difference）基本类似的方法，不同之处在于它同时考虑 e 和 t：用 thrust::transform 处理 e 和 t 各自的移位表示。
2. 确定每个段对应的 f 值。既然已经知道每个（t-）段的结尾，就可以直接从 i 中选取下一个值（使用 copy_if，并以段结尾标记作为 stencil）作为前一段的 f 值。为了便于实现，并且由于您的 i 只是一个索引序列，我创建的 i 向量比您给出的多一个元素。
3. 为每个段创建一个数值递增的索引。这只需对步骤 1 中创建的向量做一次 exclusive scan。
4. 使用步骤 3 中创建的索引序列，把步骤 2 中得到的各段 f 值“分散”（scatter）到 f 结果中（这里的“分散”通过 thrust::copy 和 permutation 迭代器完成）。

下面是一个完整的示例，借用了你的代码：

可以使用非常相似的步骤序列来创建 l 向量。

Answer 2

我找到了另一种方法。

为了能够使用 upper_bound，我需要把 t 转换成一个全局有序的序列（即下面输出中的 norm_t）。为此，我首先用 adjacent_difference 找出每个 sub_segment 的起点；然后用 scatter_if 把 counting_iterator 的值（即索引）复制到每个 sub_segment 的起点处；最后用 inclusive_scan 把该值传播到同一 sub_segment 的所有元素。我把 inclusive_scan 之前的两个步骤合并到了自定义仿函数 my_scatter 中，以实现更好的内核融合。

现在upper_bound会应用于这些全局增加的值，以计算f。

l 可以通过在 e 上应用 upper_bound 来计算。

我不确定我的方法的效率与@RobertCrovella提出的方法相比如何。

输出：

d_e:    0   0   0   0   0   0   1   1   1   
d_t:    1   1   4   4   4   5   1   6   7   
d_i:    0   1   2   3   4   5   6   7   8   
norm_t: 0   0   2   2   2   7   13  20  28  
d_f:    2   2   5   5   5   6   7   8   9   
d_l:    6   6   6   6   6   6   9   9   9

#include <stdint.h>

#include <iostream>
#include <iterator>

#include <thrust/adjacent_difference.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/transform.h>
#include <thrust/tuple.h>

// PRINTER(x) prints the vector x, labelled with its own identifier.
#define PRINTER(name) print(#name, (name))

// Emits "<name>:" and a tab, then each element of `v` followed by a tab, and
// finally a newline; std::endl flushes std::cout as well.
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::ostream_iterator<T> sink(std::cout, "\t");
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), sink);
    std::cout << std::endl;
}

// Unary functor that maps an element index to a "run start" marker.
//
// For index i it compares the element at first[i] with the one at first[i-1]:
//   * i is returned unchanged when i is not greater than 0, or when the two
//     elements differ (i.e. i begins a new run of equal elements);
//   * 0 is returned when the element equals its predecessor.
//
// Despite its name the functor performs no scatter; it only reads through
// `first`. The element type of IteratorType must be comparable with ==.
template <typename IteratorType, typename IndexType = uint32_t>
struct my_scatter : public thrust::unary_function<IndexType,IndexType>
{
    // `first` is the iterator to element 0 of the sequence being inspected.
    my_scatter(IteratorType first) : first(first)
    {
    }

   // const-qualified: the call reads through `first` but never modifies the
   // functor, so it can also be invoked on a const functor object.
   __host__ __device__
   IndexType operator()(const IndexType& i) const
   {
      // The i > 0 test comes first so that first[i-1] is never read for i == 0.
      const bool continues_run =
          i > static_cast<IndexType>(0) &&
          *(first+i) == *(first+i-static_cast<IndexType>(1));
      return continues_run ? static_cast<IndexType>(0) : i;
   }

   IteratorType first;
};

// Factory for my_scatter: IteratorType is deduced from the argument, so the
// caller does not have to spell out the iterator type.
template <typename IteratorType>
my_scatter<IteratorType> make_my_scatter(IteratorType first)
{
  typedef my_scatter<IteratorType> functor_type;
  return functor_type(first);
}

int main()
{
    // Two segments keyed by e; inside each segment the data is sorted by t.
    // i is simply the element index.
    uint32_t e[] = {0,0,0,0,0,0,1,1,1};
    uint32_t t[] = {1,1,4,4,4,5,1,6,7};
    uint32_t i[] = {0,1,2,3,4,5,6,7,8};

    int size = sizeof(i)/sizeof(i[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_i(i,i+size);
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i = h_i;
    DVec d_e = h_e;
    DVec d_t = h_t;
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

    DVec norm_t(size);

    // Pair every e with its t so that a sub-segment boundary is detected when
    // either key changes. The listing previously called an undefined `zip`
    // helper here; the standard Thrust zip-iterator factory is used instead.
    auto et_begin = thrust::make_zip_iterator(thrust::make_tuple(d_e.begin(), d_t.begin()));
    auto my_scatter_op = make_my_scatter(et_begin);

    // my_scatter yields the index at the first element of each sub-segment and
    // 0 elsewhere; the inclusive scan turns that into a non-decreasing sequence
    // that is constant inside a sub-segment and grows at every boundary.
    // NOTE(review): the scan accumulates start indices in uint32_t, so for
    // large inputs with many sub-segments the sum may wrap around - confirm
    // the expected input sizes or widen the type.
    auto ti_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), my_scatter_op);
    auto ti_end = thrust::make_transform_iterator(thrust::make_counting_iterator(size), my_scatter_op);
    thrust::inclusive_scan(ti_begin, ti_end, norm_t.begin());
    PRINTER(norm_t);

    // f: for every element, the position just past its sub-segment, found by
    // searching each norm_t value in the sorted norm_t sequence.
    DVec d_f(size);
    thrust::upper_bound(norm_t.begin(), norm_t.end(), norm_t.begin(), norm_t.end(), d_f.begin());
    PRINTER(d_f);

    // l: for every element, the position just past its segment; e is already
    // sorted, so it can be searched directly.
    DVec d_l(size);
    thrust::upper_bound(d_e.begin(), d_e.end(), d_e.begin(), d_e.end(), d_l.begin());
    PRINTER(d_l);

    return 0;
}

CUDA/Thrust 中分段数据的向量化 upper_bound（上界）

2 个答案: