我正在尝试使用CUDA Thrust来解决问题。
我有一个包含3
元素的主机数组。是否可以使用Thrust创建384
元素的设备数组,其中主机数组中的3
元素重复128
次(128 x 3 = 384
)?
一般来说,从3
元素数组开始,如何使用Thrust生成大小为X
的设备数组,其中X = Y x 3
,即Y
为重复次数?
答案 0 :(得分:5)
一种可能的方法:
此代码是对示例的跨步范围示例的一个微不足道的修改。您可以将REPS
define更改为128,以查看完整扩展到384个输出元素:
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <ostream>
#define STRIDE 3
#define REPS 15 // change to 128 if you like
#define DSIZE (STRIDE*REPS)
// this example illustrates how to make strided access to a range of values
// examples:
// strided_range([0, 1, 2, 3, 4, 5, 6], 1) -> [0, 1, 2, 3, 4, 5, 6]
// strided_range([0, 1, 2, 3, 4, 5, 6], 2) -> [0, 2, 4, 6]
// strided_range([0, 1, 2, 3, 4, 5, 6], 3) -> [0, 3, 6]
// ...
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
int main(void)
{
thrust::host_vector<int> h_data(STRIDE);
h_data[0] = 1;
h_data[1] = 2;
h_data[2] = 3;
thrust::device_vector<int> data(DSIZE);
typedef thrust::device_vector<int>::iterator Iterator;
strided_range<Iterator> pos1(data.begin(), data.end(), STRIDE);
strided_range<Iterator> pos2(data.begin()+1, data.end(), STRIDE);
strided_range<Iterator> pos3(data.begin()+2, data.end(), STRIDE);
thrust::fill(pos1.begin(), pos1.end(), h_data[0]);
thrust::fill(pos2.begin(), pos2.end(), h_data[1]);
thrust::fill(pos3.begin(), pos3.end(), h_data[2]);
// print the generated data
std::cout << "data: ";
thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, " ")); std::cout << std::endl;
return 0;
}
答案 1 :(得分:1)
Robert Crovella已经使用strided ranges回答了这个问题。他还指出了使用expand operator的可能性。
下面,我提供了使用expand运算符的工作示例。与使用跨步范围相反,它避免了for
循环的需要。
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>
#include <stdio.h>
using namespace thrust::placeholders;
/*************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX */
/*************************************/
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {
T Ncols; // --- Number of columns
__host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
__host__ __device__ T operator()(T i) { return i / Ncols; }
};
/*******************/
/* EXPAND OPERATOR */
/*******************/
template <typename InputIterator1, typename InputIterator2, typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator output)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
difference_type input_size = thrust::distance(first1, last1);
difference_type output_size = thrust::reduce(first1, last1);
// scan the counts to obtain output offsets for each input element
thrust::device_vector<difference_type> output_offsets(input_size, 0);
thrust::exclusive_scan(first1, last1, output_offsets.begin());
// scatter the nonzero counts into their corresponding output positions
thrust::device_vector<difference_type> output_indices(output_size, 0);
thrust::scatter_if(thrust::counting_iterator<difference_type>(0), thrust::counting_iterator<difference_type>(input_size),
output_offsets.begin(), first1, output_indices.begin());
// compute max-scan over the output indices, filling in the holes
thrust::inclusive_scan(output_indices.begin(), output_indices.end(), output_indices.begin(), thrust::maximum<difference_type>());
// gather input values according to index array (output = first2[output_indices])
OutputIterator output_end = output; thrust::advance(output_end, output_size);
thrust::gather(output_indices.begin(), output_indices.end(), first2, output);
// return output + output_size
thrust::advance(output, output_size);
return output;
}
/**************************/
/* STRIDED RANGE OPERATOR */
/**************************/
template <typename Iterator>
class strided_range
{
public:
typedef typename thrust::iterator_difference<Iterator>::type difference_type;
struct stride_functor : public thrust::unary_function<difference_type,difference_type>
{
difference_type stride;
stride_functor(difference_type stride)
: stride(stride) {}
__host__ __device__
difference_type operator()(const difference_type& i) const
{
return stride * i;
}
};
typedef typename thrust::counting_iterator<difference_type> CountingIterator;
typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
// type of the strided_range iterator
typedef PermutationIterator iterator;
// construct strided_range for the range [first,last)
strided_range(Iterator first, Iterator last, difference_type stride)
: first(first), last(last), stride(stride) {}
iterator begin(void) const
{
return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
}
iterator end(void) const
{
return begin() + ((last - first) + (stride - 1)) / stride;
}
protected:
Iterator first;
Iterator last;
difference_type stride;
};
/********/
/* MAIN */
/********/
int main(){
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int Nrows = 10; // --- Number of objects
const int Ncols = 3; // --- Number of centroids
thrust::device_vector<int> d_sequence(Nrows * Ncols);
thrust::device_vector<int> d_counts(Ncols, Nrows);
thrust::sequence(d_sequence.begin(), d_sequence.begin() + Ncols);
expand(d_counts.begin(), d_counts.end(), d_sequence.begin(),
thrust::make_permutation_iterator(
d_sequence.begin(),
thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)));
printf("\n\nCentroid indices\n");
for(int i = 0; i < Nrows; i++) {
std::cout << " [ ";
for(int j = 0; j < Ncols; j++)
std::cout << d_sequence[i * Ncols + j] << " ";
std::cout << "]\n";
}
return 0;
}
答案 2 :(得分:1)
作为使用CUDA Thrust的一个明显更简单的替代方案,我发布在CUDA中实现的经典Matlab的 meshgrid 函数的工作示例下面。
在Matlab中
x = [1 2 3];
y = [4 5 6 7];
[X, Y] = meshgrid(x, y);
产生
X =
1 2 3
1 2 3
1 2 3
1 2 3
和
Y =
4 4 4
5 5 5
6 6 6
7 7 7
X
正好是x
数组的四重复制,这是OP的问题,也是罗伯特克罗维拉答案的第一次猜测,而{{1}是Y
数组的每个元素的三连续复制,这是Robert Crovella的答案的第二个猜测。
以下是代码:
y