Question

我是C ++和CUDA编码的新手，并编写了一个我希望并行化的程序，因为根据NSIGHT分析器，它目前只使用了25％的GPU。

下面，我编写了一个玩具程序，尝试用 thrust::for_each() 配合 CUDA 流来实现，但我似乎无法修改数组。我习惯使用 thrust::transform()，它在调用中提供了一个输出数组。

我似乎找到的所有示例都是使用for_each调用来打印，或者对修改后的数组不做任何操作。

当我运行这个程序时，它只返回一个零数组。

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <iterator>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include <thrust/transform.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/tuple.h>
#include <thrust/count.h>
#include <thrust/sequence.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/for_each.h>
#include <ctime>
#include <cstdio>
#include <cassert>

using namespace std;

// Shorthand iterator aliases. Every alias below names the same type: the
// iterator of a thrust::device_vector<float>.
typedef thrust::device_vector<float>::iterator FloatDeviceIter;
typedef FloatDeviceIter normIter;
typedef FloatDeviceIter deltaIter;
typedef FloatDeviceIter gammaskIter;
typedef FloatDeviceIter zetmaskIter;
typedef FloatDeviceIter zetvalIter;
//typedef thrust::zip_iterator<tpl2intiter>  idxzip;

// A tuple holding one iterator per alias above, in declaration order.
typedef thrust::tuple<normIter, deltaIter, gammaskIter, zetmaskIter, zetvalIter> IteratorTuple;

// The zip_iterator built over that tuple of iterators.
typedef thrust::zip_iterator<IteratorTuple> ZipIterator;

// Functor handed to thrust::for_each over a zipped (input, output) range.
// For each element it reads component 0 of the tuple, adds 5 and assigns the
// sum to component 1.
//
// NOTE(review): operator() receives the tuple BY VALUE as
// thrust::tuple<float, float>, so the assignment to thrust::get<1>(x) only
// changes this call's local copy and the zipped output vector is left
// untouched. thrust::for_each also ignores the functor's return value. To
// write through to the output element the parameter has to be a tuple of
// references, i.e. thrust::tuple<float &, float &>.
//
// The template parameter T and the stored stream are not used by operator().
template<typename T>
struct my_function
{
    // Stream passed in by the caller; only stored, never read in this struct.
    cudaStream_t s;

    my_function(cudaStream_t s) : s(s) {}
    __host__ __device__ float operator()(thrust::tuple<float, float> x)
    {
        // Input value taken from the first zipped sequence.
        float y = thrust::get<0>(x);
        // Writes y + 5 into the second tuple component and returns that value.
        return thrust::get<1>(x) = y + 5;
    }
};

// Prints every element of a device vector, one value per line.
// The vector is copied to the host once up front instead of reading the
// device elements one at a time.
static void print_device_vector(const thrust::device_vector<float>& values)
{
    thrust::host_vector<float> host_values = values;
    for (size_t idx = 0; idx < host_values.size(); ++idx) {
        cout << host_values[idx] << endl;
    }
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing call in the message.
static void check_cuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Toy driver: builds two 5-element input vectors, runs my_function over each
// of them with thrust::for_each on its own CUDA stream (zipping the input
// with a result vector), then prints the inputs, the result vectors and the
// elapsed wall-clock time.
int main() {
    clock_t start;
    double duration;
    start = clock();

    // Host-side initial values; float literals avoid a double -> float
    // conversion and let each device vector be filled with a single copy.
    const float input1[] = { 1.0f, 5.0f, 3.0f, 2.0f, 4.0f };
    const float input2[] = { 0.00f, 0.04f, 0.08f, 0.12f, 0.16f };
    const size_t count = sizeof(input1) / sizeof(input1[0]);

    thrust::device_vector<float> d_fraction(input1, input1 + count);
    thrust::device_vector<float> d_fraction2(input2, input2 + count);

    cout << "original" << endl;
    print_device_vector(d_fraction);
    cout << "original" << endl;
    print_device_vector(d_fraction2);

    // One stream per for_each call so the two launches may overlap.
    cudaStream_t s1, s2;
    check_cuda(cudaStreamCreate(&s1), "cudaStreamCreate(s1)");
    check_cuda(cudaStreamCreate(&s2), "cudaStreamCreate(s2)");

    // Output vectors; device_vector value-initializes them to 0.
    thrust::device_vector<float> result1(count);
    thrust::device_vector<float> result2(count);

    // Each call visits the zipped (input, result) pairs on the given stream.
    thrust::for_each(thrust::cuda::par.on(s1),
        thrust::make_zip_iterator(thrust::make_tuple(d_fraction.begin(), result1.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_fraction.end(), result1.end())),
        my_function<float>(s1));

    thrust::for_each(thrust::cuda::par.on(s2),
        thrust::make_zip_iterator(thrust::make_tuple(d_fraction2.begin(), result2.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_fraction2.end(), result2.end())),
        my_function<float>(s2));

    // Wait for both streams before reading the results; an error raised by
    // the asynchronous work is reported here.
    check_cuda(cudaStreamSynchronize(s1), "cudaStreamSynchronize(s1)");
    check_cuda(cudaStreamSynchronize(s2), "cudaStreamSynchronize(s2)");

    cout << "norm" << endl;
    print_device_vector(result1);
    cout << "dut" << endl;
    print_device_vector(result2);

    check_cuda(cudaStreamDestroy(s1), "cudaStreamDestroy(s1)");
    check_cuda(cudaStreamDestroy(s2), "cudaStreamDestroy(s2)");

    duration = (clock() - start) / (double)CLOCKS_PER_SEC;
    cout << "time " << duration << endl;
    // Keeps the console window open until the user presses Enter.
    cin.get();
    return 0;
}

Answer 1

使用thrust::transform，仿函数运算符的返回值是分配给输出迭代器的值。正如您所发现的那样，使用thrust::for_each（只接受输入迭代器），情况并非如此。

因此，当我们想用 thrust::for_each 修改向量的元素（而不仅仅是把它们打印出来）时，必须通过传入的元组（即仿函数运算符的输入参数）把结果写回去。

您可能一直在尝试这样做：

    return thrust::get<1>(x) = y + 5;

但是这不起作用，原因有两个。第一，仿函数运算符的返回值不会被 thrust::for_each 使用（这里的返回值是 float，与迭代器解引用得到的类型，也就是元组，也不匹配）。第二，修改输入元组 x 也达不到预期的效果，因为你的写法是按值把元组传给仿函数的：

__host__ __device__ float operator()(thrust::tuple<float, float> x)
                                     ^
                                     tuple passed by value

在C（或C ++）中，当我们通过值传递参数时，对该参数的修改将不会显示在调用环境中。通常的解决方案是通过引用传递，以便在仿函数中进行的修改将显示在调用环境中（即在thrust::for_each的输入向量中）。

如果我们按照以下方式尝试这样做：

__host__ __device__ float operator()(thrust::tuple<float, float> &x)

我们得到一些有用的编译错误：

$ nvcc t1188.cu -o t1188
/usr/local/cuda/bin/..//include/thrust/detail/function.h(60): error: function "my_function<T>::operator() [with T=float]" cannot be called with the given argument list
            argument types are: (thrust::detail::tuple_of_iterator_references<float &, float &, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>)
                                                                              ^^^^^^^^^^^^^^^^  NOTE

并希望引导我们采用这种方式：

__host__ __device__ float operator()(thrust::tuple<float &, float &> x)

有效并具有预期的效果：

$ ./t1188
original
1
5
3
2
4
original
0
0.04
0.08
0.12
0.16
norm
6
10
8
7
9
dut
5
5.04
5.08
5.12
5.16
time 0.488422

我注意到的其他一些事情：

通过将stream参数传递给仿函数，我不确定您要完成的任务。在仿函数中你无法用它做任何事情，因此没有必要。

您的 my_function 仿函数前面有一个模板定义，但它没有任何用处（就我所见，模板类型 T 在仿函数中从未被使用）。也许这只是您之前编码时遗留下来的。有时可以把仿函数运算符的参数类型模板化，这样就不必记住输入参数必须写成“引用元组”的形式，但您似乎并没有这样做。

使用thrust :: for_each（）调用实现CUDA流时访问结构的结果

1 个答案: