我正试图找出使用Thrust执行以下操作的最佳方法:向量A有一百万个浮点数,它们按某种特定顺序排列。我想把A中每个满足 x > 7.0 的元素 x 移动到向量B,并且让向量A和B中的元素都保持原有的相对顺序。重要的是,只有一小部分元素需要移动。对于我的代码而言,效率比优雅更重要。
我的想法是使用从A到B的thrust::copy_if
然后在A上使用thrust::remove_if
但是我不知道要复制的元素的确切数量,而B的内存显然必须事先分配,因此还需要额外进行一次计数操作。跳过计数操作的一种不优雅的方法是为向量B预先分配“足够”的内存。
使用thrust::remove_copy_if
有很多相同的问题:你需要提前为B分配内存,而且它实际上并没有从A中删除任何东西,所以无论如何都需要另一个thrust::remove_if
。
我的另一个想法是使用thrust::stable_sort
和一些定制的比较函子,将我想要的所有元素推到A的末尾,然后以某种方式计算出这样的元素有多少个,再用thrust::copy把它们复制到B。这看起来也很不优雅......
答案 0 :(得分:4)
你使用thrust::copy_if的思路是对的。只需再分配两个与原始缓冲区大小相同的缓冲区,然后用copy_if把 > 7.0f 的元素复制到第一个缓冲区,把 <= 7.0f 的元素复制到第二个缓冲区。只要你确定有足够的空间,分配与原始缓冲区大小相同的缓冲区就没有问题,100万个浮点数只需要4MB。
修改:
我对copy_if和stable_partition两种方法进行了性能比较。在我的显卡GTX660上,当“分割”值取0.1f、0.5f和0.9f时,stable_partition所花的时间大约是copy_if的150%。我添加了测试以确保两种方法都是稳定的(保持值的顺序)。
#include <cuda.h>
#include <curand.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
#include <iostream>
#include <cassert>
// Evaluates the CUDA runtime call `x` exactly once. If it does not return
// cudaSuccess, writes the source location and the runtime's error string to
// std::cerr and returns EXIT_FAILURE from the enclosing function, so the macro
// may only be used inside functions that return int.
#define CHECK_CUDA_CALL(x) do { \
    const cudaError_t check_cuda_status_ = (x); \
    if (check_cuda_status_ != cudaSuccess) { \
        std::cerr << "Error at " << __FILE__ << ":" << __LINE__ << ": " \
                  << cudaGetErrorString(check_cuda_status_) << std::endl; \
        return EXIT_FAILURE; \
    } \
} while(0)
// Evaluates the cuRAND call `x` exactly once. If it does not return
// CURAND_STATUS_SUCCESS, writes the source location and the numeric status
// value to std::cerr and returns EXIT_FAILURE from the enclosing function,
// so the macro may only be used inside functions that return int.
#define CHECK_CURAND_CALL(x) do { \
    const curandStatus_t check_curand_status_ = (x); \
    if (check_curand_status_ != CURAND_STATUS_SUCCESS) { \
        std::cerr << "Error at " << __FILE__ << ":" << __LINE__ \
                  << ": curandStatus_t " << static_cast<int>(check_curand_status_) \
                  << std::endl; \
        return EXIT_FAILURE; \
    } \
} while(0)
// Threshold that separates "low" values (<= SPLIT) from "high" values (> SPLIT).
#define SPLIT 0.1f
// Unary predicate selecting the values that are less than or equal to SPLIT.
// Callable from both host and device code.
struct is_low
{
    // Returns true when x <= SPLIT. Declared const because the predicate has
    // no state, which also lets it be invoked through a const functor object.
    __host__ __device__ bool operator()(const float x) const
    {
        return x <= SPLIT;
    }
};
// Unary predicate selecting the values that are strictly greater than SPLIT.
// Callable from both host and device code.
struct is_high
{
    // Returns true when x > SPLIT. Declared const because the predicate has
    // no state, which also lets it be invoked through a const functor object.
    __host__ __device__ bool operator()(const float x) const
    {
        return x > SPLIT;
    }
};
// Measures the time between a start() and a stop() call with a pair of CUDA
// events. Usage: start(), run the work, stop(), then read elapsed().
// Failures of the underlying CUDA calls are reported on std::cerr instead of
// being silently dropped. The class owns its two events, so it is not copyable.
class EventTimer {
public:
    // Creates the start and stop events. The handles are zero-initialised
    // first so the destructor can tell whether creation succeeded.
    EventTimer() : mStarted(false), mStopped(false), mStart(0), mStop(0) {
        report(cudaEventCreate(&mStart), "cudaEventCreate(start)");
        report(cudaEventCreate(&mStop), "cudaEventCreate(stop)");
    }
    // Destroys whichever events were successfully created.
    ~EventTimer() {
        if (mStart) report(cudaEventDestroy(mStart), "cudaEventDestroy(start)");
        if (mStop) report(cudaEventDestroy(mStop), "cudaEventDestroy(stop)");
    }
    // Records the start event on stream `s` (stream 0 by default).
    void start(cudaStream_t s = 0) {
        report(cudaEventRecord(mStart, s), "cudaEventRecord(start)");
        mStarted = true;
        mStopped = false;
    }
    // Records the stop event on stream `s` (stream 0 by default).
    // start() must have been called first.
    void stop(cudaStream_t s = 0) {
        assert(mStarted);
        report(cudaEventRecord(mStop, s), "cudaEventRecord(stop)");
        mStarted = false;
        mStopped = true;
    }
    // Waits for the stop event and returns the time between the two events as
    // reported by cudaEventElapsedTime. Returns 0 when stop() has not been
    // called or when a CUDA call fails.
    float elapsed() {
        assert(mStopped);
        if (!mStopped) return 0;
        if (!report(cudaEventSynchronize(mStop), "cudaEventSynchronize")) return 0;
        float elapsedTime = 0;
        if (!report(cudaEventElapsedTime(&elapsedTime, mStart, mStop),
                    "cudaEventElapsedTime")) return 0;
        return elapsedTime;
    }
private:
    // Copying would make two objects destroy the same events; the copy
    // operations are declared private and intentionally left undefined.
    EventTimer(const EventTimer&);
    EventTimer& operator=(const EventTimer&);
    // Returns true on cudaSuccess; otherwise prints the failed operation and
    // the runtime's error string to std::cerr and returns false.
    static bool report(cudaError_t status, const char* what) {
        if (status == cudaSuccess) return true;
        std::cerr << "EventTimer: " << what << " failed: "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    }
    bool mStarted, mStopped;
    cudaEvent_t mStart, mStop;
};
// Returns true when `low` and `high` together are a stable split of `source`:
// walking `source` in order, every value <= SPLIT must be the next unread
// element of `low` and every other value the next unread element of `high`,
// with no element left over on either side.
static bool is_stable_split(const thrust::host_vector<float>& source,
                            const thrust::host_vector<float>& low,
                            const thrust::host_vector<float>& high)
{
    // With matching totals, consuming one output slot per source element
    // guarantees that both outputs are fully consumed at the end.
    if (low.size() + high.size() != source.size()) return false;
    size_t low_pos = 0;
    size_t high_pos = 0;
    for (size_t i = 0; i < source.size(); ++i) {
        const float value = source[i];
        if (value <= SPLIT) {
            if (low_pos == low.size() || low[low_pos] != value) return false;
            ++low_pos;
        }
        else {
            if (high_pos == high.size() || high[high_pos] != value) return false;
            ++high_pos;
        }
    }
    return true;
}

// Benchmarks two ways of splitting n uniformly distributed floats around
// SPLIT: two thrust::copy_if passes into separate buffers, and an in-place
// thrust::stable_partition. Each variant is timed five times and every result
// is verified on the host. Returns EXIT_FAILURE when a cuRAND call fails or
// a result is not a stable split of the input, EXIT_SUCCESS otherwise.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
    const size_t n = 1024 * 1024 * 50;
    // Create prng
    curandGenerator_t gen;
    CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
    // Set seed
    CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
    // Generate n floats on device
    thrust::device_vector<float> vec_rnd_d(n);
    float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
    CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n));
    // The input is never modified below, so a single host copy serves every
    // verification pass.
    const thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
    // Verification uses explicit checks rather than assert so that it still
    // runs when NDEBUG is defined.
    bool ok = true;
    {
        // Scoped so the two output buffers are released before the
        // stable_partition runs allocate their working copy.
        thrust::device_vector<float> vec_low_d(n);
        thrust::device_vector<float> vec_high_d(n);
        for (int i = 0; ok && i < 5; ++i) {
            EventTimer timer;
            timer.start();
            thrust::device_vector<float>::iterator low_end =
                thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
            thrust::device_vector<float>::iterator high_end =
                thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
            timer.stop();
            std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;
            // check result: only the ranges actually written by copy_if
            thrust::host_vector<float> vec_low_h(vec_low_d.begin(), low_end);
            thrust::host_vector<float> vec_high_h(vec_high_d.begin(), high_end);
            ok = is_stable_split(vec_rnd_h, vec_low_h, vec_high_h);
            if (!ok) {
                std::cerr << "copy_if: result is not a stable split of the input" << std::endl;
            }
        }
    }
    for (int i = 0; ok && i < 5; ++i) {
        // stable_partition works in place, so each run needs a fresh copy.
        thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
        EventTimer timer;
        timer.start();
        thrust::device_vector<float>::iterator iter_split =
            thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
        timer.stop();
        std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;
        // check result: [begin, iter_split) holds the low values, the rest the high ones
        thrust::host_vector<float> vec_low_h(vec_rnd_copy.begin(), iter_split);
        thrust::host_vector<float> vec_high_h(iter_split, vec_rnd_copy.end());
        ok = is_stable_split(vec_rnd_h, vec_low_h, vec_high_h);
        if (!ok) {
            std::cerr << "stable_partition: result is not a stable split of the input" << std::endl;
        }
    }
    CHECK_CURAND_CALL(curandDestroyGenerator(gen));
    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
输出:
C:\rd\projects\cpp\test_cuda\Release>test_cuda.exe
copy_if: 40.2919ms
copy_if: 38.0157ms
copy_if: 38.5036ms
copy_if: 37.6751ms
copy_if: 38.1054ms
stable_partition: 59.5473ms
stable_partition: 61.4016ms
stable_partition: 59.1854ms
stable_partition: 61.3195ms
stable_partition: 59.1205ms
答案 1 :(得分:3)
为了回答我自己的问题,我终于找到了thrust::stable_partition,它比所有基于“copy_if”的替代方案都更高效、更优雅。它只是将所有不满足谓词的元素移动到数组的末尾,并返回第二个序列的开头。通过指针运算可以得到B的大小,但实际上已经不再需要它了:
// stable_partition keeps the elements satisfying `pred` at the front of A (in
// their original order) and returns the start of the remaining elements.
// `pred` must therefore select the elements that STAY in A; the ones that fail
// it are the ones moved to B.
thrust::device_vector<float>::iterator iter = thrust::stable_partition(A.begin(), A.end(), pred);
// B receives the tail, i.e. every element that does not satisfy pred.
thrust::device_vector<float> B(iter, A.end());
// Drop the moved elements from A.
A.erase(iter, A.end());