给定两个键值列表,我试图通过匹配键并在键匹配时将函数应用于两个值来组合双方。在我的情况下,我想要乘以数值。一个小例子,使其更清晰:
Left keys: { 1, 2, 4, 5, 6 }
Left values: { 3, 4, 1, 2, 1 }
Right keys: { 1, 3, 4, 5, 6, 7 };
Right values: { 2, 1, 1, 4, 1, 2 };
Expected output keys: { 1, 4, 5, 6 }
Expected output values: { 6, 1, 8, 1 }
我已经能够用下面的 C++ 代码在 CPU 上实现它：
// CPU reference implementation of the keyed join: both inputs are sorted by
// key, and for every key present on BOTH sides the pair
// (key, leftValue * rightValue) is emitted. The result is printed to stdout.
int main() {
    const int leftKeys[] = { 1, 2, 4, 5, 6 };
    const int leftValues[] = { 3, 4, 1, 2, 1 };
    const int rightKeys[] = { 1, 3, 4, 5, 6, 7 };
    const int rightValues[] = { 2, 1, 1, 4, 1, 2 };
    // Derive the element counts from the arrays instead of repeating 5 and 6.
    const int leftCount = static_cast<int>(sizeof(leftKeys) / sizeof(leftKeys[0]));
    const int rightCount = static_cast<int>(sizeof(rightKeys) / sizeof(rightKeys[0]));

    int leftIndex = 0, rightIndex = 0;
    std::vector<std::tuple<int, int>> result;
    // Classic merge-join. Exactly one branch runs per iteration, so the bounds
    // in the loop condition are re-checked after every index advance and a
    // pair is only emitted when the two keys are really equal.
    while (leftIndex < leftCount && rightIndex < rightCount) {
        if (leftKeys[leftIndex] < rightKeys[rightIndex]) {
            ++leftIndex;   // left key has no partner yet; advance left
        } else if (leftKeys[leftIndex] > rightKeys[rightIndex]) {
            ++rightIndex;  // right key has no partner yet; advance right
        } else {
            result.push_back(std::make_tuple(leftKeys[leftIndex], leftValues[leftIndex] * rightValues[rightIndex]));
            ++leftIndex;
            ++rightIndex;
        }
    }

    // Print results
    for (const auto& entry : result) {
        std::cout << "Key: " << std::get<0>(entry) << "; Value: " << std::get<1>(entry) << "\n";
    }
}
但是，我的输入键和值都存放在 Thrust 的 device_vector 中，而且我也需要结果留在 GPU 上。因此，如果不必把所有输入复制到主机、再把所有输出复制回设备，效率会更高。
问题是，我找不到可以按键组合两个列表（并对两侧的值应用一个函数）的 Thrust 函数。是否存在这样的函数？或者有没有简单的实现方法？还是说我应该直接在主机上执行这个操作？
更新
可以对输入做出以下假设:
更新2:
在实现 @Robert 回答中的第二种方法时，我卡在了变换（transform）这一步。到目前为止我的代码如下：
// Binary functor for thrust::transform over two (key, value) tuples.
// When both keys are equal it returns (key, leftValue * rightValue);
// otherwise it returns the sentinel tuple (-1, -1).
// The base-class argument/result typedefs use thrust::tuple so that they agree
// with the types operator() actually takes and returns.
struct multiply_transformation : public thrust::binary_function<thrust::tuple<int, int>, thrust::tuple<int, int>, thrust::tuple<int, int> >
{
    // d_left / d_right: (key, value) tuples taken by value.
    // const-qualified: the functor carries no state.
    __host__ __device__
    thrust::tuple<int, int> operator()(thrust::tuple<int, int> d_left, thrust::tuple<int, int> d_right) const
    {
        if (thrust::get<0>(d_left) == thrust::get<0>(d_right)) {
            return thrust::make_tuple(thrust::get<0>(d_left), thrust::get<1>(d_left) * thrust::get<1>(d_right));
        }
        // Keys differ: mark this position as "no match".
        return thrust::make_tuple(-1, -1);
    }
};
thrust::device_vector<int> d_mergedKeys(h_leftCount + h_rightCount);
thrust::device_vector<int> d_mergedValues(h_leftCount + h_rightCount);
thrust::merge_by_key(d_leftCountKeys.begin(), d_leftCountKeys.begin() + h_leftCount,
d_rightCountKeys.begin(), d_rightCountKeys.begin() + h_rightCount,
d_leftCounts.begin(), d_rightCounts.begin(), d_mergedKeys.begin(), d_mergedValues.begin());
typedef thrust::tuple<int, int> IntTuple;
thrust::zip_iterator<IntTuple> d_zippedCounts(thrust::make_tuple(d_mergedKeys.begin(), d_mergedValues.begin()));
thrust::zip_iterator<IntTuple> d_zippedCountsOffset(d_zippedCounts + 1);
multiply_transformation transformOperator;
thrust::device_vector<IntTuple> d_transformedResult(h_leftCount + h_rightCount);
thrust::transform(d_zippedCounts, d_zippedCounts + h_leftCount + h_rightCount - 1, d_zippedCountsOffset, d_transformedResult.begin(), transformOperator);
但是，我得到的错误是：没有与参数列表匹配的 thrust::transform 重载函数。在上面的代码中，h_leftCount 和 h_rightCount 是左右输入的大小；d_leftCountKeys、d_rightCountKeys、d_leftCounts 和 d_rightCounts 都是 thrust::device_vector<int>。
答案 0 :(得分:3)
嗯,我不确定这是最好的方法(@ m.s。通常会提出比我更好的方法),但一种可能的方法是(方法1):
我不知道你对thrust的技能水平是什么,但如果需要,我可以提供一个简单的上述例子。
另一种可能的方法(方法2):
我的感觉是第二种方法可能更快,但我还没有仔细考虑过它。无论如何,对测试用例进行基准测试比对(我的)直觉更好。
根据下面的评论,以下是使用您的示例数据集从方法2的第2步开始的情况说明:
步骤1的输出(merge_by_key操作)看起来像这样:
keys: { 1, 1, 2, 3, 4, 4, 5, 5, 6, 6, 7 };
values: { 3, 2, 4, 1, 1, 1, 2, 4, 1, 1, 2 };
让我们构建两个版本，第一个是“当前项”，第二个是“右边的下一项”：
keys1: { 1, 1, 2, 3, 4, 4, 5, 5, 6, 6 };
values1: { 3, 2, 4, 1, 1, 1, 2, 4, 1, 1 };
keys2: { 1, 2, 3, 4, 4, 5, 5, 6, 6, 7 };
values2: { 2, 4, 1, 1, 1, 2, 4, 1, 1, 2 };
实际的“构造”是微不足道的。keys1 只是 [keys.begin(), keys.end() - 1)，而 keys2 只是 [keys.begin() + 1, keys.end())。values1 和 values2 同理。
我们将keys1和values1压缩在一起,我们将keys2和values2压缩在一起。然后我们将这两个压缩实体传递给一个转换操作,该操作有一个特殊的仿函数,可以执行以下操作:
如果 keys1 == keys2，就对 values1 和 values2 执行所需的数学运算，并在标记数组中放置 1；否则，在标记数组中放置 0。此操作的输出将是：
keys: { 1, 2, 3, 4, 4, 5, 5, 6, 6, 7 };
values: { 6, 4, 1, 1, 1, 8, 4, 1, 1, 2 };
marker: { 1, 0, 0, 1, 0, 1, 0, 1, 0, 0 };
现在将上面的3个向量压缩在一起,并将其传递给remove_if。 remove_if仿函数将指示删除标记== 0的任何项目,并保留:
keys: { 1, 4, 5, 6 };
values: { 6, 1, 8, 1 };
marker: { 1, 1, 1, 1 };
这是一个完整的例子,展示了两种方法:
$ cat t1007.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/set_operations.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <thrust/merge.h>
#include <thrust/remove.h>
#include <assert.h>
// Transform functor for method 2 (separate marker array).
// z1 is a (key, value, marker) tuple for the current element and z2 a
// (key, value) tuple for its right neighbour. When the keys match, the marker
// slot of z1 is set to 1 and the product of the two values is returned;
// otherwise the marker is left untouched and 0 is returned.
struct mark_mpy_func
{
  template <typename T1, typename T2>
  __host__ __device__
  int operator()(T1 &z1, T2 &z2){
    // Different keys: nothing to multiply, nothing to mark.
    if (!(thrust::get<0>(z1) == thrust::get<0>(z2))) return 0;
    const int product = thrust::get<1>(z1) * thrust::get<1>(z2);
    thrust::get<2>(z1) = 1;
    return product;
  }
};
// Stencil predicate for remove_if: an element is removed when its marker is 0.
struct mtest_func
{
  __host__ __device__
  bool operator()(int t){
    return t == 0;
  }
};
int main(){
int Lkeys[] = { 1, 2, 4, 5, 6 };
int Lvals[] = { 3, 4, 1, 2, 1 };
int Rkeys[] = { 1, 3, 4, 5, 6, 7 };
int Rvals[] = { 2, 1, 1, 4, 1, 2 };
size_t Lsize = sizeof(Lkeys)/sizeof(int);
size_t Rsize = sizeof(Rkeys)/sizeof(int);
thrust::device_vector<int> Lkeysv(Lkeys, Lkeys+Lsize);
thrust::device_vector<int> Lvalsv(Lvals, Lvals+Lsize);
thrust::device_vector<int> Rkeysv(Rkeys, Rkeys+Rsize);
thrust::device_vector<int> Rvalsv(Rvals, Rvals+Rsize);
// method 1
thrust::device_vector<int> Lkeysvo(Lsize);
thrust::device_vector<int> Lvalsvo(Lsize);
thrust::device_vector<int> Rkeysvo(Rsize);
thrust::device_vector<int> Rvalsvo(Rsize);
size_t Lsizeo = thrust::set_intersection_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), Lkeysvo.begin(), Lvalsvo.begin()).first - Lkeysvo.begin();
size_t Rsizeo = thrust::set_intersection_by_key(Rkeysv.begin(), Rkeysv.end(), Lkeysv.begin(), Lkeysv.end(), Rvalsv.begin(), Rkeysvo.begin(), Rvalsvo.begin()).first - Rkeysvo.begin();
assert(Lsizeo == Rsizeo);
thrust::device_vector<int> res1(Lsizeo);
thrust::transform(Lvalsvo.begin(), Lvalsvo.begin()+Lsizeo, Rvalsvo.begin(), res1.begin(), thrust::multiplies<int>());
std::cout << "Method 1 result:" << std::endl << "keys: ";
thrust::copy_n(Lkeysvo.begin(), Lsizeo, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(res1.begin(), Lsizeo, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
// method 2
thrust::device_vector<int> Mkeysv(Lsize + Rsize);
thrust::device_vector<int> Mvalsv(Lsize + Rsize);
thrust::merge_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), Rvalsv.begin(), Mkeysv.begin(), Mvalsv.begin());
thrust::device_vector<int> marker(Lsize + Rsize - 1);
thrust::device_vector<int> res2(Lsize + Rsize - 1);
thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), Mvalsv.begin(), marker.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, Mvalsv.end()-1, marker.end())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin()+1, Mvalsv.begin()+1)), res2.begin(), mark_mpy_func());
size_t rsize2 = thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple( Mkeysv.begin(), res2.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, res2.end())), marker.begin(), mtest_func()) - thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), res2.begin()));
std::cout << "Method 2 result:" << std::endl << "keys: ";
thrust::copy_n(Mkeysv.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(res2.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1007 t1007.cu
$ ./t1007
Method 1 result:
keys: 1,4,5,6,
vals: 6,1,8,1,
Method 2 result:
keys: 1,4,5,6,
vals: 6,1,8,1,
$
如果可以在输出数据中使用标记值(例如-1)来通知remove_if操作,则可以省去单独的标记数组。这是方法2的修改版本,以这种方式工作:
$ cat t1007.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/merge.h>
#include <thrust/remove.h>
#define MARK_VAL -1
// Transform functor for the sentinel variant of method 2.
// z1 and z2 are (key, value) tuples of an element and its right neighbour.
// Returns the product of the two values when the keys match, and MARK_VAL
// otherwise so that the position can be removed afterwards.
struct mark_mpy_func
{
  template <typename T1, typename T2>
  __host__ __device__
  int operator()(T1 &z1, T2 &z2){
    // Different keys: flag the slot with the sentinel.
    if (!(thrust::get<0>(z1) == thrust::get<0>(z2))) return MARK_VAL;
    return thrust::get<1>(z1) * thrust::get<1>(z2);
  }
};
// Predicate for remove_if over zipped (key, value) tuples: an element is
// removed when its value (tuple slot 1) equals the MARK_VAL sentinel.
struct mtest_func
{
  template <typename T>
  __host__ __device__
  bool operator()(T t){
    return thrust::get<1>(t) == MARK_VAL;
  }
};
int main(){
int Lkeys[] = { 1, 2, 4, 5, 6 };
int Lvals[] = { 3, 4, 1, 2, 1 };
int Rkeys[] = { 1, 3, 4, 5, 6, 7 };
int Rvals[] = { 2, 1, 1, 4, 1, 2 };
size_t Lsize = sizeof(Lkeys)/sizeof(int);
size_t Rsize = sizeof(Rkeys)/sizeof(int);
thrust::device_vector<int> Lkeysv(Lkeys, Lkeys+Lsize);
thrust::device_vector<int> Lvalsv(Lvals, Lvals+Lsize);
thrust::device_vector<int> Rkeysv(Rkeys, Rkeys+Rsize);
thrust::device_vector<int> Rvalsv(Rvals, Rvals+Rsize);
// method 2
thrust::device_vector<int> Mkeysv(Lsize + Rsize);
thrust::device_vector<int> Mvalsv(Lsize + Rsize);
thrust::merge_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), Rvalsv.begin(), Mkeysv.begin(), Mvalsv.begin());
thrust::device_vector<int> res2(Lsize + Rsize - 1);
thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), Mvalsv.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, Mvalsv.end()-1)), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin()+1, Mvalsv.begin()+1)), res2.begin(), mark_mpy_func());
size_t rsize2 = thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple( Mkeysv.begin(), res2.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, res2.end())), mtest_func()) - thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), res2.begin()));
std::cout << "Method 2 result:" << std::endl << "keys: ";
thrust::copy_n(Mkeysv.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(res2.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1007 t1007.cu
$ ./t1007
Method 2 result:
keys: 1,4,5,6,
vals: 6,1,8,1,
$
答案 1 :(得分:3)
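您可以只调用一次 thrust::set_intersection_by_key 来完成所有操作。
但是，需要满足一些先决条件：
首先，简单的一个：
您需要将 Lvalsv 和 Rvalsv 压缩（zip）为一个 thrust::zip_iterator，并将其作为值传递给 thrust::set_intersection_by_key。注意：set_intersection_by_key 只会从第一个键范围对应的值序列中取值，因此压缩进去的 Rvalsv 是按左侧下标读取的；只有当每个匹配键在左右两个列表中的下标相同时，这种配对才是正确的。
你已经可以运行了：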
// The intersection can never hold more elements than the shorter input.
std::size_t min_size = std::min(Lsize, Rsize);
thrust::device_vector<int> result_keys(min_size);
thrust::device_vector<int> result_values_left(min_size);
thrust::device_vector<int> result_values_right(min_size);
// set_intersection_by_key copies values only from the sequence that belongs to
// its FIRST key range. Zipping Rvalsv next to Lvalsv would therefore read the
// right-hand values at left-hand positions (and past the end of Rvalsv when
// the left list is longer), so each side gets its own pass instead.
auto result_pair = thrust::set_intersection_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), result_keys.begin(), result_values_left.begin());
// Second pass with the roles swapped; it rewrites result_keys with the same
// matching keys and stores the right-hand values of those keys.
thrust::set_intersection_by_key(Rkeysv.begin(), Rkeysv.end(), Lkeysv.begin(), Lkeysv.end(), Rvalsv.begin(), result_keys.begin(), result_values_right.begin());
这将产生两个结果向量,您需要将元素相乘以获得最终结果。
但等等,如果你能避免将这两个向量作为结果存储,然后再次读取每个元素以将它们相乘,然后将最终结果存储在第三个向量中,那不是很好吗?
我们就这样做。我采用的思路改编自 here。
transform_output_iterator 是一个迭代器，它是另一个 OutputIterator 的包装器。写入 transform_output_iterator 时，UnaryFunction 将应用于要写入的值，然后该结果将写入被包装的 OutputIterator。
这样我们就可以把 thrust::set_intersection_by_key 的结果传给 Multiplier 仿函数，然后把相乘后的结果存入单个 result_values 向量中。
以下代码实现了这个想法:
#include <thrust/iterator/iterator_traits.h>
#include <thrust/iterator/iterator_facade.h>
#include <thrust/iterator/iterator_adaptor.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <thrust/set_operations.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
// PRINTER(x) prints the container x, labelled with its own variable name.
#define PRINTER(name) print(#name, (name))
// Writes "<name>:" followed by every element of v, tab-separated, and ends
// the line. V<T, Args...> is any container template whose first template
// argument is the element type.
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    std::ostream_iterator<T> sink(std::cout, "\t");
    thrust::copy(v.begin(), v.end(), sink);
    std::cout << std::endl;
}
// Assignable stand-in produced when a transform_output_iterator is
// dereferenced: assigning a value applies the stored function to it and
// writes the result through the wrapped output iterator.
template <typename OutputIterator, typename UnaryFunction>
class Proxy
{
    // Both members are references; the proxy does not own either object.
    UnaryFunction& transform_;
    OutputIterator& target_;

public:
    // Binds the function to apply and the iterator to write through.
    __host__ __device__
    Proxy(UnaryFunction& fun, OutputIterator& out)
        : transform_(fun), target_(out)
    {
    }

    // Stores fun(value) at the wrapped iterator's current position and
    // returns a copy of this proxy.
    template <typename Value>
    __host__ __device__
    Proxy operator=(const Value& value) const
    {
        *target_ = transform_(value);
        return *this;
    }
};
// This iterator is a wrapper around another OutputIterator which
// applies a UnaryFunction before writing to the OutputIterator.
//
// It is built on thrust::iterator_adaptor with OutputIterator as the adapted
// base. The last template argument passed to the adaptor is the Proxy type,
// which is also what dereference() returns: writing `*it = x` therefore goes
// through Proxy::operator=, which stores fun(x) in the wrapped iterator.
template <typename OutputIterator, typename UnaryFunction>
class transform_output_iterator : public thrust::iterator_adaptor<
transform_output_iterator<OutputIterator, UnaryFunction>
, OutputIterator
, thrust::use_default
, thrust::use_default
, thrust::use_default
, Proxy<const OutputIterator, const UnaryFunction> >
{
// Function applied to every value before it is written out.
UnaryFunction fun;
public:
// dereference() below is private; presumably this friend declaration is what
// lets the adaptor machinery call it.
friend class thrust::iterator_core_access;
// shorthand for the name of the iterator_adaptor we're deriving from
typedef thrust::iterator_adaptor<
transform_output_iterator<OutputIterator, UnaryFunction>,
OutputIterator, thrust::use_default, thrust::use_default, thrust::use_default, Proxy<const OutputIterator, const UnaryFunction>
> super_t;
// out: the iterator that receives the transformed values.
// fun: the function applied to each value before it is written.
__host__ __device__
transform_output_iterator(OutputIterator out, UnaryFunction fun) : super_t(out), fun(fun)
{
}
private:
// Returns a Proxy bound to this iterator's own `fun` member and to the wrapped
// base iterator. The proxy holds references into this object, so it is only
// meant to be used while this iterator is alive.
__host__ __device__
typename super_t::reference dereference() const
{
return Proxy<const OutputIterator, const UnaryFunction>(fun, this->base_reference());
}
};
// Unary functor that multiplies the first two elements of a tuple.
// The return type is whatever the product of those two elements yields.
struct Multiplier
{
    template<typename Pair>
    __host__ __device__
    auto operator()(Pair values) const
        -> decltype(thrust::get<0>(values) * thrust::get<1>(values))
    {
        return thrust::get<0>(values) * thrust::get<1>(values);
    }
};
// Convenience factory: deduces the template arguments and returns a
// transform_output_iterator that applies `fun` to every value written to `out`.
template <typename OutputIterator, typename UnaryFunction>
transform_output_iterator<OutputIterator, UnaryFunction>
__host__ __device__
make_transform_output_iterator(OutputIterator out, UnaryFunction fun)
{
    typedef transform_output_iterator<OutputIterator, UnaryFunction> iterator_type;
    return iterator_type(out, fun);
}
int main()
{
int Lkeys[] = { 1, 2, 4, 5, 6 };
int Lvals[] = { 3, 4, 1, 2, 1 };
int Rkeys[] = { 1, 3, 4, 5, 6, 7 };
int Rvals[] = { 2, 1, 1, 4, 1, 2 };
size_t Lsize = sizeof(Lkeys)/sizeof(int);
size_t Rsize = sizeof(Rkeys)/sizeof(int);
thrust::device_vector<int> Lkeysv(Lkeys, Lkeys+Lsize);
thrust::device_vector<int> Lvalsv(Lvals, Lvals+Lsize);
thrust::device_vector<int> Rkeysv(Rkeys, Rkeys+Rsize);
thrust::device_vector<int> Rvalsv(Rvals, Rvals+Rsize);
std::size_t min_size = std::min(Lsize, Rsize);
thrust::device_vector<int> result_keys(min_size);
thrust::device_vector<int> result_values(min_size);
auto zipped_values = thrust::make_zip_iterator(thrust::make_tuple(Lvalsv.begin(), Rvalsv.begin()));
auto output_it = make_transform_output_iterator(result_values.begin(), Multiplier());
auto result_pair = thrust::set_intersection_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), zipped_values, result_keys.begin(), output_it);
std::size_t new_size = result_pair.first - result_keys.begin();
result_keys.resize(new_size);
result_values.resize(new_size);
PRINTER(result_keys);
PRINTER(result_values);
}
**输出**
$ nvcc -std=c++11 main.cu && ./a.out
result_keys: 1 4 5 6
result_values: 6 1 8 1
答案 2 :(得分:1)
我认为需要两次集合交集，如第一个答案所示。其他解决方案并不正确：只是因为输入数据碰巧如此，它们才产生了正确的结果。例如，如果从左侧集合中删除第二个（键，值）对，它们的计算结果就会改变，而实际上结果不应该改变。以下是代码：
$ cat inner_join.cu
#include <thrust/set_operations.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <iostream>
int main()
{
int _Lkeys[] = {1, 4, 5, 6};
int _Lvals[] = {3, 1, 2, 1};
int _Rkeys[] = {1, 3, 4, 5, 6, 7};
int _Rvals[] = {2, 1, 1, 4, 1, 2};
size_t Lsize = sizeof(_Lkeys) / sizeof(int);
size_t Rsize = sizeof(_Rkeys) / sizeof(int);
thrust::device_vector<int> Lkeys(_Lkeys, _Lkeys + Lsize);
thrust::device_vector<int> Lvals(_Lvals, _Lvals + Lsize);
thrust::device_vector<int> Rkeys(_Rkeys, _Rkeys + Rsize);
thrust::device_vector<int> Rvals(_Rvals, _Rvals + Rsize);
std::size_t min_size = std::min(Lsize, Rsize);
thrust::device_vector<int> result_keys(min_size);
thrust::device_vector<int> result_Rvals(min_size);
thrust::device_vector<int> result_Lvals(min_size);
// set intersection keys, and left set values
size_t intersection_size =
thrust::set_intersection_by_key(Lkeys.begin(), Lkeys.end(), Rkeys.begin(),
Rkeys.end(), Lvals.begin(),
result_keys.begin(), result_Lvals.begin())
.first -
result_keys.begin();
// set intersection keys, and right set values
thrust::set_intersection_by_key(Rkeys.begin(), Rkeys.end(), Lkeys.begin(),
Lkeys.end(), Rvals.begin(),
result_keys.begin(), result_Rvals.begin());
result_Lvals.resize(intersection_size);
result_keys.resize(intersection_size);
thrust::device_vector<int> result_values(intersection_size);
// join left and right intersection values
thrust::transform(result_Lvals.begin(), result_Lvals.end(),
result_Rvals.begin(), result_values.begin(),
thrust::multiplies<int>());
std::cout << "keys: ";
thrust::copy_n(result_keys.begin(), intersection_size,
std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(result_values.begin(), intersection_size,
std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
}
**输出**
$ nvcc inner_join.cu -run
keys: 1,4,5,6,
vals: 6,1,8,1,