Question

这是我的内核调用代码

// Locates the smallest and largest elements of dev_vec with a single
// thrust::minmax_element call and writes them through min and max.
// dev_vec is received by value, exactly as in the original signature.
inline void find_min_max(thrust::device_vector<Npp8u> dev_vec, Npp8u *min, Npp8u *max){
    typedef thrust::device_vector<Npp8u>::iterator DeviceIter;

    // first -> minimum element, second -> maximum element
    thrust::pair<DeviceIter, DeviceIter> extrema =
        thrust::minmax_element(dev_vec.begin(), dev_vec.end());

    // Dereferencing the iterators yields the element values for the caller.
    *min = *extrema.first;
    *max = *extrema.second;
}

我还用原始 CUDA 内核（采用 map-reduce 范式）和简单的 CPU 代码实现了相同的算法。测量结果显示，Thrust 是最慢的。

为简洁起见，我使用 CUDA 事件来测量原始 CUDA 代码和 Thrust 代码。如果事件适用于 Thrust 的基准测试，那么我相当确定我正确地测量了执行时间。

这是测量部分：

    ....
    // Measure the Thrust path with a pair of CUDA events recorded on stream 0.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // Device vector allocation: the vector is constructed from the range
    // [imageHost, imageHost+N) after the start event is recorded, so its
    // construction is included in the measured interval.
    // NOTE(review): imageHost is presumably a host buffer -- confirm.
    thrust::device_vector<Npp8u> image_dev(imageHost, imageHost+N);

    // find_min_max takes the device_vector by value, so that copy is
    // also made inside the measured interval.
    find_min_max(image_dev,&min,&max);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);  // wait until the stop event has completed
    float elapsedTime1;
    cudaEventElapsedTime(&elapsedTime1, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    // cudaEventElapsedTime reports milliseconds; convert to seconds.
    totalTime1 = elapsedTime1/1000;
....

我真正的问题是：除了 Thrust 中简单的 minmax_element 函数之外，是否有更好的方法？

我的机器规格：这是配备GeForce 635M和i7处理器的华硕k55v笔记本电脑。

完整代码：Thrust 代码和 CPU 代码

Answer 1

您没有给出任何用来与 Thrust 作比较的代码，没有提供任何机器规格（GPU、CPU 等），也没有告诉我们实际测得的时间是多少。

不过，我拿你的代码创建了一个测试用例，将 Thrust 与 STL 进行比较（因为你没有给出你的 CPU 代码或任何其他实现）：

#include <stdio.h>
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <algorithm>
#include <time.h>

#define N 1000000
#define LOOPS 1000

// Locates the smallest and largest elements of dev_vec with a single
// thrust::minmax_element call and writes them through min and max.
// dev_vec is received by reference, so the vector itself is not copied.
inline void find_min_max(thrust::device_vector<int> &dev_vec, int *min, int *max){
    typedef thrust::device_vector<int>::iterator DeviceIter;

    // first -> minimum element, second -> maximum element
    thrust::pair<DeviceIter, DeviceIter> extrema =
        thrust::minmax_element(dev_vec.begin(), dev_vec.end());

    // Dereferencing the iterators yields the element values for the caller.
    *min = *extrema.first;
    *max = *extrema.second;
}


// Benchmark driver: fills a vector with N rand() values, then times
// (1) Thrust minmax_element, including the per-iteration host-to-device
// copy, using CUDA events, and (2) std::max_element + std::min_element on
// the host using clock(). Both timings are averaged over LOOPS iterations
// and printed in seconds.
int main(){
    int minele, maxele;

    // Build the input on the host and mirror it into a thrust::host_vector.
    std::vector<int> a;
    for (int i=0; i<N; i++)
      a.push_back(rand());
    thrust::host_vector<int> h_a(N);
    thrust::copy(a.begin(), a.end(), h_a.begin());

    // --- Thrust timing (CUDA events on stream 0) ---
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    for (int i=0; i < LOOPS; i++){
      // A fresh device_vector is constructed from h_a on every iteration,
      // so the host-to-device copy is part of the measured time.
      thrust::device_vector<int> d_a = h_a;
      find_min_max(d_a,&minele,&maxele);
      }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);  // wait until the stop event has completed
    float elapsedTime1, totalTime1;
    cudaEventElapsedTime(&elapsedTime1, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    // cudaEventElapsedTime reports milliseconds; dividing by 1000*LOOPS
    // yields the average seconds per iteration.
    totalTime1 = elapsedTime1/(1000*LOOPS);
    printf("thrust min element = %d, max element = %d\n", minele, maxele);
    printf("thrust time = %f\n", totalTime1);

    // --- STL timing (clock()) ---
    clock_t t;
    t = clock();
    std::vector<int>::iterator resultmax, resultmin;
    for (int i = 0; i<LOOPS; i++){
      // Two separate passes over the data: one for the max, one for the min.
      resultmax = std::max_element(a.begin(), a.end());
      resultmin = std::min_element(a.begin(), a.end());
      }
    t = clock() - t;
    printf("STL min element = %d, max element = %d\n", *resultmin, *resultmax);
    // Convert clock ticks to average seconds per iteration.
    printf("STL time = %f\n", ((float)t)/(CLOCKS_PER_SEC*LOOPS));
  return 0;
}

我在 CUDA 5.0、RHEL 5.5、Xeon X5560 2.8GHz CPU 和 Quadro 5000 GPU 的环境下编译并运行了这段代码。Quadro 5000 是一个计算能力（cc）2.0 的设备，比 M2050 稍慢一些（11 个 SM 而不是 14 个）。结果如下：

thrust min element = 1210, max element = 2147480021
thrust time = 0.001741
STL min element = 1210, max element = 2147480021
STL time = 0.004520

即使考虑到我在 STL 中用了 2 次函数调用来获取最小值和最大值（要知道 C++11 标准包含单次调用即可完成的 minmax 函数，即 std::minmax_element），并因此把 STL 时间减半，Thrust 仍然更快。

如果您想讨论为什么您的情况可能比较特殊，请提供完整的、可编译的简单比较代码（类似于我提供的代码），以及您的机器规格和实际的计时结果。

作为一个次要的优化建议：如果您通过引用（&）而不是按值将 device_vector 传递给 find_min_max 函数，运行会更快。

在我的测试中，如果我把主机 -> device_vector 的复制移出计时循环，我的 Thrust 时间会从 0.001741 秒下降到 0.000387 秒，这表明主机 -> 设备的复制约占 Thrust 总时间的 78%。

编辑：现在你已经发布了你的代码（尽管你没有提到你得到的时间），我用 512x512 的 lena 灰度图像运行了它，并在我的环境上得到以下结果：

$ ./cpu
        Version: P5
        Comment: # Created by Imlib
        Width: 512 Height: 512
        Max value: 255
ELAPSED TIME -AVG finding max and min: 0.0014437
ELAPSED TIME -AVG finding max and min: 0.0038715
$ ./thr
Load PGM file.
        Version: P5
        Comment: # Created by Imlib
        Width: 512 Height: 512
        Max value: 255
ELAPSED TIME -AVG for kernel 1: 0.000658944
ELAPSED TIME -AVG for kernel 2: 0.000179552
$

所以在我看来，即使用你的代码，在我的环境上 Thrust 也更快。

查找数组的最大值和最小值时，Thrust 如此之慢？

1 个答案: