CUDA Thrust - 计算匹配的子阵列

时间:2015-09-03 21:52:58

标签: c++ arrays cuda thrust entropy

我想弄清楚能否使用CUDA高效地计算一组数字的条件熵。计算条件熵的方法是:先将数组划分为若干窗口,然后针对不同的长度统计相互匹配的子阵列/子串的数量。对于每个子阵列长度,将各匹配子阵列的计数乘以该计数的对数,再把这些乘积相加,即得到该长度对应的熵。最后,所有长度中得到的最小熵就是条件熵。

为了更明确地说明我的意思,这里有完整的计算:

  1. 初始数组是[1,2,3,5,1,2,5]。假设窗口大小为3,则必须将其分为五个窗口:[1,2,3],[2,3,5],[3,5,1],[5,1,2]和[1,2,5]。

  2. 接下来,查看每个窗口,我们希望找到每个长度的匹配子数组。

  3. 长度为1的子阵列为[1],[2],[3],[5],[1]。其中[1]出现两次,其余数字各出现一次。所以熵是 log(2)*2 + 3*(log(1)*1) ≈ 0.6。

  4. 长度为2的子阵列为[1,2],[2,3],[3,5],[5,1]和[1,2]。其中[1,2]出现两次,其余三个子阵列各出现一次(共四种不同的子阵列)。熵与长度1时相同:log(2)*2 + 3*(log(1)*1) ≈ 0.6。

  5. 长度为3的子阵列是完整的窗口:[1,2,3],[2,3,5],[3,5,1],[5,1,2]和[ 1,2,5]。所有五个窗口都是唯一的,因此熵是5 *(log(1)* 1)= 0.

  6. 最小熵为0,表示它是该数组的条件熵。

  7. 这也可以表示为树,其中每个节点的计数表示存在多少匹配。每个子阵列长度的熵等于树的每个级别的熵。 Conditional Entropy Tree

    如果可能,我希望一次对多个数组执行此计算,并且计算本身也能并行执行。有没有人对如何实现这一点有建议?Thrust 有用吗?如果还需要我提供其他信息,请告诉我。

1 个答案:

答案 0 :(得分:2)

我尝试用Thrust解决你的问题。它可以工作,但会产生大量的Thrust调用。 由于您的输入规模相当小,您应该并行处理多个数组。 但是,这样做会带来大量的簿记(索引管理)工作,您将在以下代码中看到这一点。

您的输入范围仅限于[1,5],相当于[0,4]。一般的想法是:(理论上)由该范围内的数构成的任何元组(例如{1,2,3})都可以表示为一个以4为基数的数字(例如1+2*4+3*16 = 57)。(注意:只有当每个元素都小于基数时,这种表示才是唯一的;若元素可以取到4,严格来说应使用基数5。) 在实践中,我们受整数类型大小的限制。对于32位无符号整数,这将导致最大元组大小为16。这也是以下代码可以处理的最大窗口大小(更改为64位无符号整数将使最大元组大小变为32)。

假设输入数据的结构如下: 我们要并行处理2个数组,每个数组的大小为5,窗口大小为3

{{0,0,3,4,4},{0,2,1,1,3}}

我们现在可以生成所有窗口:

{{0,0,3},{0,3,4},{3,4,4}},{{0,2,1},{2,1,1},{1,1,3}}

对每个元组计算前缀和,并按上述方式把元组的每个前缀表示为单个base-4数字,我们得到:

{{0,0,48},{0,12,76},{3,19,83}},{{0,8,24},{2,6,22},{1,5,53}}

现在我们对这些值重新排序,使表示相同长度子阵列的数字彼此相邻:

{{0,0,3},{0,12,19},{48,76,83}},{{0,2,1},{8,6,5},{24,22,53}}

然后我们在每个组内进行排序:

{{0,0,3},{0,12,19},{48,76,83}},{{0,1,2},{5,6,8},{22,24,53}}

现在我们可以计算每个组中出现数字的频率:

2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1

应用对数公式后得到:

0.60206,0,0,0,0,0

现在我们获取每个数组的最小值:

0,0
#include <chrono>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <math.h>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/random.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/tuple.h>

// PRINTER(name) forwards a variable to print(), labelled with the variable's
// own identifier, when PRINT_ENABLED is defined at compile time.
// Without PRINT_ENABLED it expands to nothing.
#ifdef PRINT_ENABLED
#define PRINTER(name) print(#name, (name))
#else
#define PRINTER(name)
#endif

// Writes `name`, a colon and a tab to std::cout, then every element of `v`
// followed by a tab, then a newline (flushing the stream).
// V is any container template whose first template argument is the element
// type T; the remaining template arguments are absorbed by Args.
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::ostream_iterator<T> tab_separated(std::cout, "\t");

    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), tab_separated);
    std::cout << std::endl;
}


// Device functor that maps an element index to a pseudo-random Integer drawn
// from a uniform distribution constructed with the bounds (Min, Max).
// A default-constructed engine is created on every call and advanced by
// `index` steps before drawing, so the result is a function of `index` only.
template <typename Integer, Integer Min, Integer Max>
struct random_filler
{
    __device__
    Integer operator()(std::size_t index) const
    {
        thrust::default_random_engine engine;
        engine.discard(index);

        thrust::uniform_int_distribution<Integer> pick(Min, Max);
        return pick(engine);
    }
};

// Decomposes a flat index i0 into the "all windows of all arrays" layout
// (array-major, then window, then position inside the window).
//
// Returned tuple:
//   <0> i0 / PerArrayCount   index of the array
//   <1> i0 % PerArrayCount   flat position inside that array's windows
//   <2> <1> / WindowSize     index of the window inside the array
//   <3> <1> % WindowSize     position inside the window
//
// WindowCount and PerArrayCount are derived from the other template
// parameters; ArrayCount does not take part in the computation.
//
// Declared __host__ __device__ (it was __device__ only) because it is called
// from __host__ __device__ functors (sliding_window::window_functor and
// base_n); a __device__-only helper cannot legally be called on the host path.
template <std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          typename T,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount>
__host__ __device__ __inline__
thrust::tuple<T,T,T,T> calc_indices(const T& i0)
{
    const T i1 = i0 / PerArrayCount;   // array index
    const T i2 = i0 % PerArrayCount;   // position within the array's windows
    const T i3 = i2 / WindowSize;      // window index
    const T i4 = i2 % WindowSize;      // position within the window
    return thrust::make_tuple(i1,i2,i3,i4);
}

// Lazy view that exposes, starting from `first`, every window of WindowSize
// consecutive elements of ArrayCount back-to-back arrays of ArraySize
// elements each, as one flat sequence of TotalCount elements
// (array-major, then window, then position inside the window).
// Nothing is copied: the view is a permutation iterator over the input.
// WindowCount, PerArrayCount and TotalCount are derived from the other
// template parameters.
template <typename Iterator, 
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount,
          std::size_t TotalCount =  PerArrayCount * ArrayCount
          >
class sliding_window
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    // Maps a position in the flat window sequence to the position of the
    // corresponding element in the underlying input.
    struct window_functor : public thrust::unary_function<difference_type,difference_type>
    {
        __host__ __device__
        difference_type operator()(const difference_type& i0) const
        { 
            const auto parts = calc_indices<ArraySize, ArrayCount,WindowSize>(i0);

            const difference_type array_index  = thrust::get<0>(parts);
            const difference_type window_index = thrust::get<2>(parts);
            const difference_type offset       = thrust::get<3>(parts);

            // The window with index w starts at element w of its array.
            return array_index * ArraySize + window_index + offset;
        }
    };

    using CountingIterator    = thrust::counting_iterator<difference_type>;
    using TransformIterator   = thrust::transform_iterator<window_functor, CountingIterator>;
    using PermutationIterator = thrust::permutation_iterator<Iterator,TransformIterator>;

    using iterator = PermutationIterator;

    sliding_window(Iterator first) : first(first){}

    // Iterator to the first element of the first window.
    iterator begin(void) const
    {
        CountingIterator  flat_position(0);
        TransformIterator source_position(flat_position, window_functor());
        return PermutationIterator(first, source_position);
    }

    // Iterator one past the last element of the last window.
    iterator end(void) const
    {
        iterator last = begin();
        last += TotalCount;
        return last;
    }

    protected:
    Iterator first;
};

// Convenience factory: builds a sliding_window over `first`, deducing the
// iterator type so that only the three sizes have to be spelled out.
template <std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          typename Iterator>
sliding_window<Iterator, ArraySize, ArrayCount, WindowSize>
make_sliding_window(Iterator first)
{
    typedef sliding_window<Iterator, ArraySize, ArrayCount, WindowSize> window_view;
    return window_view(first);
}


// Device functor that turns a flat index of the window layout into the key
// (array index, window index). Elements of the same window share a key.
template <typename KeyType, 
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize> 
struct key_generator : thrust::unary_function<KeyType, thrust::tuple<KeyType,KeyType> >
{
    __device__
    thrust::tuple<KeyType,KeyType> operator()(std::size_t i0) const
    {
        const auto parts = calc_indices<ArraySize, ArrayCount,WindowSize>(i0);

        const KeyType array_index  = thrust::get<0>(parts);
        const KeyType window_index = thrust::get<2>(parts);
        return thrust::make_tuple(array_index, window_index);
    }
};


// Functor that weights one window element by its positional value in a
// base-`Base` number.
//
// Input : tuple <0> flat index in the window layout, <1> element value.
// Output: value * Base^p, where p is the element's position inside its
//         window (component <3> of calc_indices).
//
// The encoding of a sub-array is only unique if every element value is
// smaller than Base, and it only fits if Base^WindowSize is representable
// in Integer.
template <typename Integer,
          std::size_t Base,
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize>
struct base_n : thrust::unary_function<thrust::tuple<Integer, Integer>, Integer>
{
    __host__ __device__ 
    Integer operator()(const thrust::tuple<Integer, Integer> t) const
    {    
        const auto i = calc_indices<ArraySize, ArrayCount, WindowSize>(thrust::get<0>(t));

        // ipow could be optimized by precomputing a lookup table at compile time
        const auto result =  thrust::get<1>(t)*ipow(Base, thrust::get<3>(i));
        return result;
    }

    // Integer power by repeated squaring; expects exp >= 0.
    // Based on http://stackoverflow.com/a/101613/678093
    __host__ __device__ __inline__
    Integer ipow(Integer base, Integer exp) const
    {
        Integer result = 1;
        while (exp)
        {
            if (exp & 1)
                result *= base;
            exp >>= 1;
            // Square only while another bit remains. The unconditional
            // squaring of the original computed a value that was never used
            // and could overflow Integer (e.g. 65536*65536 for base 4,
            // exponent 9), which is undefined behaviour for signed types.
            if (exp)
                base *= base;
        }

        return result;
    }
};

// Decomposes a flat index i0 into the "grouped by sub-array length" layout
// (array-major, then length group, then window).
//
// Returned tuple:
//   <0> i0 % PerArrayCount   flat position inside the array's block
//   <1> i0 / PerArrayCount   index of the array
//   <2> <0> % WindowCount    index of the window
//   <3> <0> / WindowCount    index of the length group
//
// WindowCount and PerArrayCount are derived from the other template
// parameters; ArrayCount does not take part in the computation.
template <std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          typename T,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount>
__device__ __inline__
thrust::tuple<T,T,T,T> calc_sort_indices(const T& i0)
{
    const T array_index  = i0 / PerArrayCount;
    const T within_array = i0 % PerArrayCount;
    const T length_index = within_array / WindowCount;
    const T window_index = within_array % WindowCount;
    return thrust::make_tuple(within_array, array_index, window_index, length_index);
}

// Device functor producing the gather map from the window-major layout to
// the length-grouped layout: for destination index i0 (array, length group,
// window) it returns the source index (array, window, position in window).
template <typename Integer,
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount>
struct pre_sort : thrust::unary_function<Integer, Integer>
{
    __device__
    Integer operator()(Integer i0) const
    {
        const auto parts = calc_sort_indices<ArraySize, ArrayCount,WindowSize>(i0);

        const Integer array_index  = thrust::get<1>(parts);
        const Integer window_index = thrust::get<2>(parts);
        const Integer position     = thrust::get<3>(parts);

        const Integer source = ( window_index * WindowSize + position ) + array_index * PerArrayCount;
        return source;
    }
};


// Device functor that maps a flat index of the length-grouped layout to the
// sort key (array index, length-group index).
//
// The base class now advertises the tuple that operator() really returns;
// it previously declared result_type as Integer, which would mislead any
// adaptor relying on result_type (e.g. a transform_iterator).
template <typename Integer,
          std::size_t ArraySize, 
          std::size_t ArrayCount,
          std::size_t WindowSize,
          std::size_t WindowCount = ArraySize - (WindowSize-1),
          std::size_t PerArrayCount = WindowSize * WindowCount>
struct generate_sort_keys : thrust::unary_function<Integer, thrust::tuple<Integer,Integer> >
{
    __device__
    thrust::tuple<Integer,Integer> operator()(Integer i0) const
    {
        auto t = calc_sort_indices<ArraySize, ArrayCount,WindowSize>(i0);

        // <1> = array index, <3> = length-group index
        return thrust::make_tuple( thrust::get<1>(t), thrust::get<3>(t));
    }
};


// Shorthand for building a zip_iterator from a list of iterators.
template<typename... Iterators>
__host__ __device__
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
    const thrust::tuple<Iterators...> bundled = thrust::make_tuple(its...);
    return thrust::make_zip_iterator(bundled);
}


// Functor returning i * log10(i), computed in single precision.
struct calculate_log : thrust::unary_function<std::size_t, float>
{
  __host__ __device__
  float operator()(std::size_t i) const
  {
    const float count = static_cast<float>(i);
    return count * log10f(count);
  }
};


// Benchmark driver.
//
// Fills ARRAY_COUNT arrays of ARRAY_SIZE pseudo-random digits and then,
// `runs` times, computes for every array the minimum over all sub-array
// lengths 1..WINDOW_SIZE of sum(count * log10(count)), where `count` is the
// number of occurrences of each distinct sub-array of that length among the
// array's sliding windows. Finally prints the average wall-clock time per run.
//
// ARRAY_COUNT, ARRAY_SIZE and WINDOW_SIZE must be supplied as preprocessor
// definitions (e.g. -DARRAY_SIZE=1000). Define PRINT_ENABLED to dump the
// intermediate vectors.
int main()
{
    typedef int Integer;
    typedef float Real;

    const std::size_t array_count = ARRAY_COUNT;
    const std::size_t array_size = ARRAY_SIZE;

    const std::size_t window_size = WINDOW_SIZE;

    // window_count below is computed in unsigned arithmetic and would wrap
    // around for a window larger than the array.
    static_assert(window_size >= 1 && window_size <= array_size,
                  "WINDOW_SIZE must be between 1 and ARRAY_SIZE");

    const std::size_t window_count = array_size - (window_size-1);

    const std::size_t input_size = array_count * array_size;

    // Sub-arrays are encoded as base-`base` numbers stored in Integer, so
    // base^WINDOW_SIZE has to be representable in Integer.
    const std::size_t base = 4;

    // Every element is one digit of that number and must therefore be
    // strictly smaller than `base`. Filling with [0, base] (as before) lets
    // different sub-arrays collide, e.g. {4,0} and {0,1} both encode to 4,
    // which corrupts the match counts.
    const Integer max_digit = static_cast<Integer>(base) - 1;

    thrust::device_vector<Integer> input_arrays(input_size);

    thrust::counting_iterator<Integer> counting_it(0);

    thrust::transform(counting_it,
                      counting_it + input_size,
                      input_arrays.begin(),
                      random_filler<Integer,0,max_digit>());

    PRINTER(input_arrays);

    const int runs = 100;

    auto start = std::chrono::high_resolution_clock::now();
    for (int k = 0 ; k < runs; ++k)
    {
      // 1) Materialise all sliding windows of all arrays.
      auto sw = make_sliding_window<array_size, array_count, window_size>(input_arrays.begin());

      const std::size_t total_count = window_size * window_count * array_count;

      thrust::device_vector<Integer> result(total_count);

      thrust::copy(sw.begin(), sw.end(), result.begin());

      PRINTER(result);

      // 2) Per-window prefix sum of value * base^position: entry p of a
      //    window becomes the base-`base` encoding of the window's first
      //    p+1 elements.
      auto ti_begin = thrust::make_transform_iterator(counting_it, key_generator<Integer, array_size, array_count, window_size>());

      auto base_4_ti = thrust::make_transform_iterator(zip(counting_it, sw.begin()), base_n<Integer, base, array_size, array_count, window_size>());

      thrust::inclusive_scan_by_key(ti_begin, ti_begin+total_count, base_4_ti, result.begin());
      PRINTER(result);

      // 3) Regroup so that, within each array, encodings of sub-arrays of
      //    the same length are adjacent.
      thrust::device_vector<Integer> result_2(total_count);

      auto ti_pre_sort = thrust::make_transform_iterator(counting_it, pre_sort<Integer, array_size, array_count, window_size>());

      thrust::gather(ti_pre_sort,
                    ti_pre_sort+total_count,
                    result.begin(),
                    result_2.begin());

      PRINTER(result_2);

      // 4) Sort the encodings inside every (array, length) group: first a
      //    stable sort by value, then a stable sort by group key, which
      //    keeps the value order within each group.
      thrust::device_vector<Integer> sort_keys_1(total_count);
      thrust::device_vector<Integer> sort_keys_2(total_count);

      auto zip_begin = zip(sort_keys_1.begin(),sort_keys_2.begin());

      thrust::transform(counting_it,
                        counting_it+total_count,
                        zip_begin,
                        generate_sort_keys<Integer, array_size, array_count, window_size>());

      thrust::stable_sort_by_key(result_2.begin(), result_2.end(), zip_begin);
      thrust::stable_sort_by_key(zip_begin, zip_begin+total_count, result_2.begin());

      PRINTER(result_2);

      thrust::device_vector<Integer> key_counts(total_count);
      thrust::device_vector<Integer> sort_keys_1_reduced(total_count);
      thrust::device_vector<Integer> sort_keys_2_reduced(total_count);

      // 5) count how often each sub array occurs
      auto zip_count_begin = zip(sort_keys_1.begin(), sort_keys_2.begin(), result_2.begin());
      auto new_end = thrust::reduce_by_key(zip_count_begin,
                                          zip_count_begin + total_count,
                                          thrust::constant_iterator<Integer>(1),
                                          zip(sort_keys_1_reduced.begin(), sort_keys_2_reduced.begin(), thrust::make_discard_iterator()),
                                          key_counts.begin()
                                          );

      std::size_t new_size = new_end.second - key_counts.begin();
      key_counts.resize(new_size);
      sort_keys_1_reduced.resize(new_size);
      sort_keys_2_reduced.resize(new_size);
      PRINTER(key_counts);
      PRINTER(sort_keys_1_reduced);
      PRINTER(sort_keys_2_reduced);

      // 6) Sum count * log10(count) per (array, length) group. The array
      //    index of every group is written back into sort_keys_1, which is
      //    reused as the key sequence of the final reduction.
      auto log_ti = thrust::make_transform_iterator (key_counts.begin(), calculate_log());

      thrust::device_vector<Real> log_result(new_size);
      auto zip_keys_reduced_begin = zip(sort_keys_1_reduced.begin(), sort_keys_2_reduced.begin());
      auto log_end = thrust::reduce_by_key(zip_keys_reduced_begin,
                                          zip_keys_reduced_begin + new_size,
                                          log_ti,
                                          zip(sort_keys_1.begin(),thrust::make_discard_iterator()),
                                          log_result.begin()
                            );
      std::size_t final_size = log_end.second - log_result.begin();
      log_result.resize(final_size);
      sort_keys_1.resize(final_size);
      PRINTER(log_result);

      // 7) Minimum over all sub-array lengths, per array.
      thrust::device_vector<Real> final_result(final_size);
      auto final_end = thrust::reduce_by_key(sort_keys_1.begin(),
                                            sort_keys_1.begin() + final_size,
                                            log_result.begin(),
                                            thrust::make_discard_iterator(),
                                            final_result.begin(),
                                            thrust::equal_to<Integer>(),
                                            thrust::minimum<Real>()
                            );

      final_result.resize(final_end.second-final_result.begin());

      PRINTER(final_result);
    }

    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start);
    std::cout << "took " << duration.count()/runs << "milliseconds" << std::endl;

    return 0;
}

使用以下命令进行编译:
nvcc -std=c++11 conditional_entropy.cu -o benchmark -DARRAY_SIZE=1000 -DARRAY_COUNT=1000 -DWINDOW_SIZE=10 && ./benchmark

我的GPU(GTX 680)上的配置需要133毫秒,因此每个阵列大约需要0.1毫秒。

这个实现肯定还可以优化,例如使用预先计算的查找表进行base-4转换,或许还可以省去一些Thrust调用。