我有3
个相同大小的数组(超过300.000
个元素)。一个浮点数和两个索引数组。因此,对于每个号码,我都有2
个ID。
所有3
数组都已存在于GPU全局内存中。我想相应地将所有数字与他们的ID排序。
有什么方法可以使用Thrust库来完成这项任务吗?有没有比推力图书馆更好的方法?
当然,我不想将它们复制到主机内存中几次。顺便说一下,它们是数组而不是向量。
提前感谢您的帮助。
暂定解决方案,但这非常慢。这需要几乎4
秒,我的数组大小按300000
thrust::device_ptr<float> keys(afterSum);
thrust::device_ptr<int> vals0(d_index);
thrust::device_ptr<int> vals1(blockId);
thrust::device_vector<int> sortedIndex(numElements);
thrust::device_vector<int> sortedBlockId(numElements);
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(numElements);
thrust::copy(iter, iter + indices.size(), indices.begin());
thrust::sort_by_key(keys, keys + numElements , indices.begin());
thrust::gather(indices.begin(), indices.end(), vals0, sortedIndex.begin());
thrust::gather(indices.begin(), indices.end(), vals1, sortedBlockId.begin());
thrust::host_vector<int> h_sortedIndex=sortedIndex;
thrust::host_vector<int> h_sortedBlockId=sortedBlockId;
答案 0 :(得分:11)
当然你可以使用Thrust。首先,您需要使用thrust::device_ptr
包装原始CUDA设备指针。假设您的浮点值在数组pkeys
中,并且ID在数组pvals0
和pvals1
中,并且numElements是数组的长度,这样的东西应该起作用:
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
thrust::device_ptr<float> keys(pkeys);
thrust::device_ptr<int> vals0(pvals0);
thrust::device_ptr<int> vals1(pvals1);
// allocate space for the output
thrust::device_vector<int> sortedVals0(numElements);
thrust::device_vector<int> sortedVals1(numElements);
// initialize indices vector to [0,1,2,..]
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(numElements);
thrust::copy(iter, iter + indices.size(), indices.begin());
// first sort the keys and indices by the keys
thrust::sort_by_key(keys.begin(), keys.end(), indices.begin());
// Now reorder the ID arrays using the sorted indices
thrust::gather(indices.begin(), indices.end(), vals0.begin(), sortedVals0.begin());
thrust::gather(indices.begin(), indices.end(), vals1.begin(), sortedVals1.begin());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Took %f milliseconds for %d elements\n", milliseconds, numElements);
答案 1 :(得分:3)
我比较了上面提出的两种方法,即使用thrust::zip_iterator
和使用thrust::gather
的方法。我按照海报的要求,在按键或三个数组排序两个数组的情况下对它们进行了测试。在所有这两种情况下,使用thrust::gather
的方法显示更快。
2
阵列
#include <time.h> // --- time
#include <stdlib.h> // --- srand, rand
#include <thrust\host_vector.h>
#include <thrust\device_vector.h>
#include <thrust\sort.h>
#include <thrust\iterator\zip_iterator.h>
#include "TimingGPU.cuh"
//#define VERBOSE
//#define COMPACT
int main() {
const int N = 1048576;
//const int N = 10;
TimingGPU timerGPU;
// --- Initialize random seed
srand(time(NULL));
thrust::host_vector<int> h_code(N);
thrust::host_vector<double> h_x(N);
thrust::host_vector<double> h_y(N);
for (int k = 0; k < N; k++) {
// --- Generate random numbers between 0 and 9
h_code[k] = rand() % 10 + 1;
h_x[k] = ((double)rand() / (RAND_MAX));
h_y[k] = ((double)rand() / (RAND_MAX));
}
thrust::device_vector<int> d_code(h_code);
thrust::device_vector<double> d_x(h_x);
thrust::device_vector<double> d_y(h_y);
#ifdef VERBOSE
printf("Before\n");
for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
timerGPU.StartCounter();
#ifdef COMPACT
thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin())));
#else
// --- Initialize indices vector to [0,1,2,..]
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(N);
thrust::copy(iter, iter + indices.size(), indices.begin());
// --- First, sort the keys and indices by the keys
thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin());
// Now reorder the ID arrays using the sorted indices
thrust::gather(indices.begin(), indices.end(), d_x.begin(), d_x.begin());
thrust::gather(indices.begin(), indices.end(), d_y.begin(), d_y.begin());
#endif
printf("Timing GPU = %f\n", timerGPU.GetCounter());
#ifdef VERBOSE
h_code = d_code;
h_x = d_x;
h_y = d_y;
printf("After\n");
for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
}
3
阵列
#include <time.h> // --- time
#include <stdlib.h> // --- srand, rand
#include <thrust\host_vector.h>
#include <thrust\device_vector.h>
#include <thrust\sort.h>
#include <thrust\iterator\zip_iterator.h>
#include "TimingGPU.cuh"
//#define VERBOSE
//#define COMPACT
int main() {
const int N = 1048576;
//const int N = 10;
TimingGPU timerGPU;
// --- Initialize random seed
srand(time(NULL));
thrust::host_vector<int> h_code(N);
thrust::host_vector<double> h_x(N);
thrust::host_vector<double> h_y(N);
thrust::host_vector<double> h_z(N);
for (int k = 0; k < N; k++) {
// --- Generate random numbers between 0 and 9
h_code[k] = rand() % 10 + 1;
h_x[k] = ((double)rand() / (RAND_MAX));
h_y[k] = ((double)rand() / (RAND_MAX));
h_z[k] = ((double)rand() / (RAND_MAX));
}
thrust::device_vector<int> d_code(h_code);
thrust::device_vector<double> d_x(h_x);
thrust::device_vector<double> d_y(h_y);
thrust::device_vector<double> d_z(h_z);
#ifdef VERBOSE
printf("Before\n");
for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
timerGPU.StartCounter();
#ifdef COMPACT
thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin(), d_z.begin())));
#else
// --- Initialize indices vector to [0,1,2,..]
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(N);
thrust::copy(iter, iter + indices.size(), indices.begin());
// --- First, sort the keys and indices by the keys
thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin());
// Now reorder the ID arrays using the sorted indices
thrust::gather(indices.begin(), indices.end(), d_x.begin(), d_x.begin());
thrust::gather(indices.begin(), indices.end(), d_y.begin(), d_y.begin());
thrust::gather(indices.begin(), indices.end(), d_z.begin(), d_z.begin());
#endif
printf("Timing GPU = %f\n", timerGPU.GetCounter());
#ifdef VERBOSE
h_code = d_code;
h_x = d_x;
h_y = d_y;
printf("After\n");
for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
}
2
N = 1048576
数组的时间安排
zip_iterator = 7.34ms
gather = 4.27ms
3
N = 1048576
数组的时间安排
zip_iterator = 9.64ms
gather = 4.22ms
在NVIDIA GTX 960卡上进行测试。
答案 2 :(得分:2)
我会使用zip_iterator在两个indice矢量上同时执行一个sort_by_key。
这看起来像这样:
typedef typename thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> IteratorTuple;
typedef typename thrust::zip_iterator<IteratorTuple> ZipIterator;
// here I suppose your 3 arrays are pointed to by device_ptr as suggested by @harrism
thrust::device_vector<float> key(pKey, pKey + numElements);
thrust::device_vector<int> val0(pVal0, pVal0 + numElements);
thrust::device_vector<int> val1(pVal1, pVal1 + numElements);
ZipIterator iterBegin(thrust::make_tuple(val0.begin(), val1.begin()));
thrust::sort_by_key(key.begin(), key.end(), iterBegin);