Question

我正试图找出使用Thrust执行以下操作的最佳方法：向量A有一百万个浮点数，它们按某种特定顺序排列。我想把A中每个满足 x > 7.0 的元素 x 移动到向量B，并且使向量A和B中元素的相对顺序都保持不变。重要的是，只有一小部分元素需要移动。对于我的代码而言，效率比优雅更重要。

我的想法是先使用thrust::copy_if从A复制到B，然后在A上使用thrust::remove_if。但是我不知道要复制的元素的确切数量，而且B的内存显然必须事先分配，因此还需要额外进行一次计数操作。跳过计数操作的一种不优雅的方法是为向量B预先分配“足够”的内存。

使用thrust::remove_copy_if有很多相同的问题：你需要提前为B分配内存，而且它实际上并没有从A中删除任何东西，所以无论如何都需要另一个thrust::remove_if。

我的另一个想法是使用thrust::stable_sort和一个定制的比较函子，把我想移出的所有元素推到A的末尾，然后以某种方式计算出这样的元素有多少个，再用thrust::copy把它们复制到B。这看起来也很不优雅......

Answer 1

你使用thrust::copy_if的思路是正确的。只需再分配两个与原缓冲区大小相同的缓冲区，然后用copy_if把 > 7.0f 的元素复制到第一个，把 <= 7.0f 的元素复制到第二个。只要你知道有足够的空间，分配与原始缓冲区大小相同的缓冲区就没有问题，100万个浮点数只占4MB。

修改

我对copy_if和stable_partition两种方法进行了性能比较。在我的显卡GTX660上，对于“分割”值0.1f、0.5f和0.9f，stable_partition花费的时间大约是copy_if的150％。我添加了测试以确保两种方法都是稳定的（保持值的顺序）。

#include <cuda.h>
#include <curand.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Report a failed CUDA runtime call with its source location and make the
// enclosing function return EXIT_FAILURE.
#define CHECK_CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    return EXIT_FAILURE;}} while(0)

// Same as CHECK_CUDA_CALL, but for cuRAND status codes.
#define CHECK_CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    return EXIT_FAILURE;}} while(0)

// Threshold separating "low" values (<= SPLIT) from "high" values (> SPLIT).
#define SPLIT 0.1f

// Predicate selecting values at or below SPLIT.
struct is_low
{
    __host__ __device__
    bool operator()(const float x) const
    {
        return x <= SPLIT;
    }
};

// Predicate selecting values strictly above SPLIT (the complement of is_low).
struct is_high
{
    __host__ __device__
    bool operator()(const float x) const
    {
        return x > SPLIT;
    }
};

// Measures the time between start() and stop() using a pair of CUDA events.
class EventTimer
{
public:
    EventTimer() : mStarted(false), mStopped(false)
    {
        cudaEventCreate(&mStart);
        cudaEventCreate(&mStop);
    }

    ~EventTimer()
    {
        cudaEventDestroy(mStart);
        cudaEventDestroy(mStop);
    }

    // Record the start event on stream s.
    void start(cudaStream_t s = 0)
    {
        cudaEventRecord(mStart, s);
        mStarted = true;
        mStopped = false;
    }

    // Record the stop event on stream s; start() must have been called first.
    void stop(cudaStream_t s = 0)
    {
        assert(mStarted);
        cudaEventRecord(mStop, s);
        mStarted = false;
        mStopped = true;
    }

    // Wait for the stop event and return the value reported by
    // cudaEventElapsedTime (printed by the caller as milliseconds).
    // Returns 0 if stop() has not been called.
    float elapsed()
    {
        assert(mStopped);
        if (!mStopped) return 0;
        cudaEventSynchronize(mStop);
        float elapsed = 0;
        cudaEventElapsedTime(&elapsed, mStart, mStop);
        return elapsed;
    }

private:
    // Non-copyable: a copy would destroy the same pair of events twice.
    EventTimer(const EventTimer&);
    EventTimer& operator=(const EventTimer&);

    bool mStarted, mStopped;
    cudaEvent_t mStart, mStop;
};

// Check that `source` was split stably around SPLIT.
//
// Walking `source` in order, every value <= SPLIT must equal the next unread
// element of low[low_pos, low_end) and every other value must equal the next
// unread element of high[high_pos, high_end). Both ranges must be consumed
// exactly. Values are compared with == on purpose: the outputs are expected to
// be bit-for-bit copies of the inputs.
//
// This is an explicit runtime check rather than assert(), because assert() is
// compiled out when NDEBUG is defined (typical for release builds), which
// would silently disable the verification.
static bool is_stable_split(const thrust::host_vector<float>& source,
                            const thrust::host_vector<float>& low,
                            size_t low_pos, size_t low_end,
                            const thrust::host_vector<float>& high,
                            size_t high_pos, size_t high_end)
{
    for (size_t i = 0; i < source.size(); ++i)
    {
        const float value = source[i];
        if (value <= SPLIT)
        {
            if (low_pos == low_end || low[low_pos] != value) return false;
            ++low_pos;
        }
        else
        {
            if (high_pos == high_end || high[high_pos] != value) return false;
            ++high_pos;
        }
    }
    return low_pos == low_end && high_pos == high_end;
}

int main(int argc, char *argv[])
{
    const size_t n = 1024 * 1024 * 50;

    // Create prng
    curandGenerator_t gen;
    CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));

    // Set seed
    CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));

    // Generate n floats on device
    thrust::device_vector<float> vec_rnd_d(n);
    float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
    CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n));

    // The input is never modified below, so one host copy serves every check.
    const thrust::host_vector<float> vec_rnd_h = vec_rnd_d;

    thrust::device_vector<float> vec_low_d(n);
    thrust::device_vector<float> vec_high_d(n);

    // Approach 1: two copy_if passes into pre-allocated, full-size outputs.
    for (int i = 0; i < 5; ++i)
    {
        EventTimer timer;
        timer.start();
        thrust::device_vector<float>::iterator low_end_d =
            thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
        thrust::device_vector<float>::iterator high_end_d =
            thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
        timer.stop();
        std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;

        // check result
        const size_t n_low = static_cast<size_t>(low_end_d - vec_low_d.begin());
        const size_t n_high = static_cast<size_t>(high_end_d - vec_high_d.begin());
        thrust::host_vector<float> vec_low_h = vec_low_d;
        thrust::host_vector<float> vec_high_h = vec_high_d;
        if (!is_stable_split(vec_rnd_h, vec_low_h, 0, n_low, vec_high_h, 0, n_high))
        {
            printf("copy_if: result is not a stable split\n");
            curandDestroyGenerator(gen);
            return EXIT_FAILURE;
        }
    }

    // Approach 2: a single in-place stable_partition on a copy of the input.
    for (int i = 0; i < 5; ++i)
    {
        thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
        EventTimer timer;
        timer.start();
        thrust::device_vector<float>::iterator iter_split =
            thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
        timer.stop();
        const size_t n_low = static_cast<size_t>(iter_split - vec_rnd_copy.begin());
        std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;

        // check result: lows occupy [0, n_low), highs occupy [n_low, size)
        thrust::host_vector<float> vec_partitioned_h = vec_rnd_copy;
        if (!is_stable_split(vec_rnd_h,
                             vec_partitioned_h, 0, n_low,
                             vec_partitioned_h, n_low, vec_partitioned_h.size()))
        {
            printf("stable_partition: result is not a stable split\n");
            curandDestroyGenerator(gen);
            return EXIT_FAILURE;
        }
    }

    CHECK_CURAND_CALL(curandDestroyGenerator(gen));
    return EXIT_SUCCESS;
}

输出：

C:\rd\projects\cpp\test_cuda\Release>test_cuda.exe copy_if: 40.2919ms copy_if: 38.0157ms copy_if: 38.5036ms copy_if: 37.6751ms copy_if: 38.1054ms stable_partition: 59.5473ms stable_partition: 61.4016ms stable_partition: 59.1854ms stable_partition: 61.3195ms stable_partition: 59.1205ms

Answer 2

为了回答我自己的问题，我终于找到了thrust::stable_partition，它比所有基于“copy_if”的替代方案都更高效、更优雅。它只是将所有不满足谓词的元素移动到数组的末尾，并返回第二个序列的起始位置。通过指针运算可以得到B的大小，但实际上已经不再需要它了：

thrust::device_vector<float>::iterator iter = thrust::stable_partition(A.begin(), A.end(), pred)
thrust::device_vector<float> B(iter, A.end())
A.erase(iter, A.end());

Thrust：选择性地将元素移动到另一个向量

2 个答案: