Question

我希望在我的cuda应用程序中使用推力，并执行以下简单测试以查看thrust :: sort

的性能

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>

int main()
{
    int min = 1;
    int max = 1024*1024;
    int n = 1024*1024;

    thrust::host_vector<int> h_input(n);
    thrust::host_vector<int> h_keysSorted(n);

    //fill host input with random data
    for(int i=0; i<n; i++){
        h_input[i] = min + (rand() % (int)(max - min + 1));
    }

    thrust::device_vector<int> d_input(n);

    float elapsedTime;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start,0);
    d_input= h_input;
    thrust::sort(d_input.begin(), d_input.end());

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    thrust::copy(d_input.begin(), d_input.end(), h_keysSorted.begin());
    std::cout<<"Elapsed time: "<<elapsedTime<<std::endl;
}

除了过长的编译和cuda上下文创建之外，上面的代码花了超过200ms来对我的gtx 770m上的1048576整数进行排序。这太可怕了。例如，paper表示对相同大小的数组进行排序时间不到2毫秒，我发现cpu timings的时间不到200毫秒。

我认为我做的事情显然是错的，但我看不出它是什么。有谁知道为什么推力需要这么长时间？我做错了什么？

为什么推力排序这么慢？

0 个答案: