Question

我有一个5000x500的矩阵，想用CUDA分别对每一行进行排序。我可以使用arrayfire，但它的实现只是在for循环中逐次调用thrust::sort，效率应该不高。

https://github.com/arrayfire/arrayfire/blob/devel/src/backend/cuda/kernel/sort.hpp

// Visit every (y, z, w) slice and sort its val.dims[0] elements with a
// separate thrust::sort call, ascending or descending per isAscending.
for (dim_type w = 0; w < val.dims[3]; w++) {
    for (dim_type z = 0; z < val.dims[2]; z++) {
        for (dim_type y = 0; y < val.dims[1]; y++) {
            // Linear offset of the first element of slice (y, z, w).
            dim_type sliceBegin = w * val.strides[3] + z * val.strides[2] + y * val.strides[1];

            if (!isAscending) {
                thrust::sort(val_ptr + sliceBegin, val_ptr + sliceBegin + val.dims[0],
                             thrust::greater<T>());
            } else {
                thrust::sort(val_ptr + sliceBegin, val_ptr + sliceBegin + val.dims[0]);
            }
        }
    }
}

有没有办法把这些Thrust操作融合起来，使各个排序并行执行？实际上，我想找的是一种把for循环的各次迭代融合到一起的通用方法。

Answer 1

我可以想到两种可能性，其中一种已经由@JaredHoberock提出。我不知道在Thrust中融合for循环迭代的通用方法，但第二种方法是更通用的做法。我的猜测是，在这种情况下，第一种方法会是两者中较快的一种。

1. 使用矢量化排序。如果嵌套for循环所要排序的各个区域互不重叠，则可以使用2次背靠背的稳定排序（stable sort）操作来完成矢量化排序，具体做法见原文“here”链接处的讨论。
2. Thrust v1.8（可通过CUDA 7 RC获得，或直接从Thrust的github仓库下载）支持嵌套调用Thrust算法，即在传给某个Thrust算法的自定义仿函数中再调用另一个Thrust算法。如果您使用thrust::for_each操作来选择需要执行的各个排序，就可以把thrust::sort操作放进传给thrust::for_each的仿函数中，从而只用一次Thrust算法调用来运行所有这些排序。

以下是3种方法之间的完全比较：

原始的循环排序方法
矢量化/批量排序
嵌套排序

在每种情况下，我们排序的都是同样的16000组数据，每组1000个int。

（完整的示例代码t617.cu及其运行结果，见下文“注意”第二条中的代码清单。）

注意：

这些结果因GPU而异。
“嵌套”时间/方法在可支持动态并行性的GPU上可能会有很大差异，因为这将影响Thrust运行嵌套排序函数的方式。要使用动态并行性进行测试，请将编译开关从-arch=sm_20更改为-arch=sm_35 -rdc=true -lcudadevrt。

完整的示例代码（t617.cu）、编译命令及运行结果如下：

$ cat t617.cu
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>

// Number of independent segments to sort.
#define NSORTS 16000
// Number of ints in each segment.
#define DSIZE 1000

// Generator for segment keys: successive calls return 0 for the first DSIZE
// calls, 1 for the next DSIZE calls, and so on.
int my_mod_start = 0;
int my_mod(){
  return (my_mod_start++)/DSIZE;
}

// Returns true when d2 matches d1 element-for-element over the length of d1.
bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2){
  return thrust::equal(d1.begin(), d1.end(), d2.begin());
}

// Functor for the nested approach: each invocation sorts one segment,
// [start_idx*dsize, (start_idx+1)*dsize), of `data` via a nested thrust::sort.
struct sort_functor
{
  thrust::device_ptr<int> data;
  int dsize;
  __host__ __device__
  void operator()(int start_idx)
  {
    thrust::sort(thrust::device, data+(dsize*start_idx), data+(dsize*(start_idx+1)));
  }
};

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

// Wall-clock helper in microseconds: returns the current gettimeofday() time
// minus `start`. Call with 0 to take a timestamp, then pass that timestamp
// back in to obtain the elapsed time.
unsigned long long dtime_usec(unsigned long long start){
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

// Sorts the same NSORTS segments of DSIZE random ints three ways (loop,
// vectorized, nested), prints the time of each, and reports a mismatch if the
// vectorized or nested result differs from the loop result.
int main(){
  // Request a device malloc heap of 16*DSIZE*NSORTS bytes; the nested sort
  // method allocates from device code.
  cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16*DSIZE*NSORTS));
  thrust::host_vector<int> h_data(DSIZE*NSORTS);
  thrust::generate(h_data.begin(), h_data.end(), rand);
  thrust::device_vector<int> d_data = h_data;

  // first time a loop
  thrust::device_vector<int> d_result1 = d_data;
  thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
  unsigned long long mytime = dtime_usec(0);
  for (int i = 0; i < NSORTS; i++)
    thrust::sort(r1ptr+(i*DSIZE), r1ptr+((i+1)*DSIZE));
  cudaDeviceSynchronize();
  mytime = dtime_usec(mytime);
  std::cout << "loop time: " << mytime/(float)USECPSEC << "s" << std::endl;

  //vectorized sort
  thrust::device_vector<int> d_result2 = d_data;
  // One segment id per element: element i belongs to segment i/DSIZE.
  thrust::host_vector<int> h_segments(DSIZE*NSORTS);
  thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
  thrust::device_vector<int> d_segments = h_segments;
  mytime = dtime_usec(0);
  // Two back-to-back stable sorts: first order all values (segment ids are
  // carried along), then stable-sort by segment id (values are carried along),
  // which regroups the segments while keeping each one internally sorted.
  thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
  thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
  cudaDeviceSynchronize();
  mytime = dtime_usec(mytime);
  std::cout << "vectorized time: " << mytime/(float)USECPSEC << "s" << std::endl;
  if (!validate(d_result1, d_result2)) std::cout << "mismatch 1!" << std::endl;

  //nested sort
  thrust::device_vector<int> d_result3 = d_data;
  sort_functor f = {d_result3.data(), DSIZE};
  // idxs holds 0..NSORTS-1; for_each hands each index to the functor, which
  // sorts the corresponding segment.
  thrust::device_vector<int> idxs(NSORTS);
  thrust::sequence(idxs.begin(), idxs.end());
  mytime = dtime_usec(0);
  thrust::for_each(idxs.begin(), idxs.end(), f);
  cudaDeviceSynchronize();
  mytime = dtime_usec(mytime);
  std::cout << "nested time: " << mytime/(float)USECPSEC << "s" << std::endl;
  if (!validate(d_result1, d_result3)) std::cout << "mismatch 2!" << std::endl;
  return 0;
}
$ nvcc -arch=sm_20 -std=c++11 -o t617 t617.cu
$ ./t617
loop time: 8.51577s
vectorized time: 0.068802s
nested time: 0.567959s
$
此代码需要CUDA 7 RC。我用了Fedora 20.
嵌套排序方法还会在设备端进行内存分配，因此我们必须使用cudaDeviceSetLimit大幅增大设备端的分配堆。
如果您使用的是动态并行，并且根据您运行的GPU类型，cudaDeviceSetLimit保留的内存量可能需要增加8倍。

如何使用Thrust来排序矩阵的行？

1 个答案: