Thrust:选择性地将元素移动到另一个向量

时间:2014-03-13 07:29:43

标签: stl cuda thrust

我正试图找出使用Thrust执行以下操作的最佳方法:向量A中有一百万个浮点数,它们按某种特定顺序排列。我想把A中每个满足 x > 7.0 的元素 x 移动到向量B中,并且在向量A和B中都保持元素原有的相对顺序。重要的是,只有一小部分元素需要移动。对于我的代码而言,效率比优雅更重要。

我的想法是先用thrust::copy_if从A复制到B,然后在A上使用thrust::remove_if。但是我不知道要复制的元素的确切数量,而且B的内存显然必须事先分配,因此还需要额外进行一次计数操作。跳过计数操作的一种不优雅的方法是为向量B预先分配“足够”的内存。

使用thrust::remove_copy_if有很多相同的问题:你需要提前为B分配内存,而且它实际上并没有从A中删除任何东西,所以无论如何都需要另一个thrust::remove_if

我的另一个想法是使用thrust::stable_sort和一个定制的比较函子,把我想要移动的所有元素推到A的末尾,然后以某种方式计算出这样的元素有多少个,再用thrust::copy把它们复制到B。这看起来也很不优雅......

2 个答案:

答案 0 :(得分:4)

你使用thrust::copy_if的思路是正确的。只需再分配两个与原缓冲区大小相同的缓冲区,然后用copy_if把 > 7.0f 的元素复制到第一个缓冲区,把 <= 7.0f 的元素复制到第二个缓冲区。只要你确定有足够的空间,分配与原始缓冲区大小相同的缓冲区是没有问题的:100万个浮点数只需要4MB。

修改

我对copy_if和stable_partition两种方法进行了性能比较。在我的显卡GTX660上,当“分割”值为0.1f、0.5f和0.9f时,stable_partition所花的时间大约是copy_if的150%。我还添加了测试,以确保两种方法都是稳定的(保持值的顺序)。

#include <cuda.h>
#include <curand.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Evaluates the CUDA runtime call `x` exactly once. If the returned status is
// not cudaSuccess, prints the file/line together with the runtime's textual
// description of the error to stderr and returns EXIT_FAILURE from the
// enclosing function -- so it may only be used inside functions returning int.
#define CHECK_CUDA_CALL(x) do { \
    const cudaError_t cuda_status_ = (x); \
    if (cuda_status_ != cudaSuccess) { \
    fprintf(stderr, "Error at %s:%d: %s\n", __FILE__, __LINE__, \
            cudaGetErrorString(cuda_status_)); \
    return EXIT_FAILURE;}} while(0)


// Evaluates the cuRAND call `x` exactly once. If the returned status is not
// CURAND_STATUS_SUCCESS, prints the file/line and the numeric status code to
// stderr and returns EXIT_FAILURE from the enclosing function -- so it may only
// be used inside functions returning int.
#define CHECK_CURAND_CALL(x) do { \
    const curandStatus_t curand_status_ = (x); \
    if (curand_status_ != CURAND_STATUS_SUCCESS) { \
    fprintf(stderr, "Error at %s:%d: cuRAND status %d\n", __FILE__, __LINE__, \
            static_cast<int>(curand_status_)); \
    return EXIT_FAILURE;}} while(0)


// Threshold used to split the data: values <= SPLIT count as "low", values
// > SPLIT count as "high" (see the is_low / is_high predicates).
#define SPLIT  0.1f

// Unary predicate selecting the "low" side of the split.
// Returns true when x is less than or equal to SPLIT. Callable from host and
// device code. operator() is const because the functor has no state, which
// lets algorithms invoke it through a const object or reference.
struct is_low
{
  __host__ __device__ bool operator()(const float x) const
  {
    return x <= SPLIT;
  }
};


// Unary predicate selecting the "high" side of the split.
// Returns true when x is strictly greater than SPLIT. Callable from host and
// device code. operator() is const because the functor has no state, which
// lets algorithms invoke it through a const object or reference.
struct is_high
{
  __host__ __device__ bool operator()(const float x) const
  {
    return x > SPLIT;
  }
};


// Times a span of GPU work with a pair of CUDA events.
//
// Usage: call start(), enqueue the work, call stop(), then read elapsed().
// The timer owns its two events: they are created in the constructor and
// destroyed in the destructor. Copying is disabled because a copy would
// destroy the same events a second time.
// CUDA errors are reported on std::cerr; they do not abort the program.
class EventTimer {
public:
  // Creates the start/stop events. Handles are zero-initialised first so they
  // hold a defined value even if creation fails.
  EventTimer() : mStarted(false), mStopped(false), mStart(0), mStop(0) {
    report(cudaEventCreate(&mStart), "cudaEventCreate(start)");
    report(cudaEventCreate(&mStop), "cudaEventCreate(stop)");
  }
  // Releases both events.
  ~EventTimer() {
    report(cudaEventDestroy(mStart), "cudaEventDestroy(start)");
    report(cudaEventDestroy(mStop), "cudaEventDestroy(stop)");
  }
  // Records the start event on stream `s` (default stream when omitted) and
  // invalidates any previous measurement.
  void start(cudaStream_t s = 0) {
    report(cudaEventRecord(mStart, s), "cudaEventRecord(start)");
    mStarted = true;
    mStopped = false;
  }
  // Records the stop event on stream `s`. Must be preceded by start().
  void stop(cudaStream_t s = 0)  {
    assert(mStarted);
    report(cudaEventRecord(mStop, s), "cudaEventRecord(stop)");
    mStarted = false;
    mStopped = true;
  }
  // Waits for the stop event and returns the time between the start and stop
  // events as reported by cudaEventElapsedTime (milliseconds).
  // Returns 0 if stop() has not been called or if the query fails.
  float elapsed() {
    assert(mStopped);
    if (!mStopped) return 0; 
    report(cudaEventSynchronize(mStop), "cudaEventSynchronize");
    float elapsed = 0;
    report(cudaEventElapsedTime(&elapsed, mStart, mStop), "cudaEventElapsedTime");
    return elapsed;
  }

private:
  // Non-copyable: declared but intentionally not defined.
  EventTimer(const EventTimer&);
  EventTimer& operator=(const EventTimer&);

  // Logs a failed CUDA call; successful statuses are ignored.
  static void report(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
      std::cerr << "EventTimer: " << what << " failed: "
                << cudaGetErrorString(status) << std::endl;
    }
  }

  bool mStarted, mStopped;
  cudaEvent_t mStart, mStop;
};


// Returns true when `low` and `high` together form a stable split of `src`
// around SPLIT. Walking `src` in order, every value <= SPLIT must be the next
// unread element of `low`, and every other value the next unread element of
// `high`. Both outputs must be consumed exactly, with no extra or missing
// elements.
static bool is_stable_split(const thrust::host_vector<float>& src,
                            const thrust::host_vector<float>& low,
                            const thrust::host_vector<float>& high)
{
    if (low.size() + high.size() != src.size()) return false;

    size_t i_low = 0;
    size_t i_high = 0;
    for (size_t i = 0; i < src.size(); ++i) {
        const float value = src[i];
        if (value <= SPLIT) {
            if (i_low == low.size() || low[i_low] != value) return false;
            ++i_low;
        }
        else {
            if (i_high == high.size() || high[i_high] != value) return false;
            ++i_high;
        }
    }
    // The sizes add up to src.size() and no index overran its vector, so both
    // outputs have been consumed completely at this point.
    return true;
}


// Benchmarks two ways of splitting n random floats around SPLIT on the device:
//   1. two thrust::copy_if passes into separate pre-allocated buffers;
//   2. one in-place thrust::stable_partition on a copy of the input.
// Each variant is timed several times with CUDA events and its result is
// checked on the host for correctness and stability. The check is an explicit
// comparison rather than an assert, so it also runs in builds with NDEBUG.
// Returns EXIT_SUCCESS if every run verified, EXIT_FAILURE otherwise.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const size_t n = 1024 * 1024 * 50;
    const int n_runs = 5;
    bool all_ok = true;

    // Create prng
    curandGenerator_t gen;
    CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));

    // Set seed
    CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));

    // Generate n floats on device 
    thrust::device_vector<float> vec_rnd_d(n);
    float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
    CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n));

    // Host reference copy of the input. The input is never modified, so one
    // copy serves every verification below.
    const thrust::host_vector<float> vec_rnd_h = vec_rnd_d;

    // Output buffers for the copy_if variant, sized for the worst case.
    thrust::device_vector<float> vec_low_d(n);
    thrust::device_vector<float> vec_high_d(n);

    for (int i = 0; i < n_runs; ++i) {
        EventTimer timer;
        timer.start();
        const thrust::device_vector<float>::iterator low_end_d =
            thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
        const thrust::device_vector<float>::iterator high_end_d =
            thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
        timer.stop();
        std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;

        // check result: bring back only the elements copy_if actually wrote
        const thrust::host_vector<float> vec_low_h(vec_low_d.begin(), low_end_d);
        const thrust::host_vector<float> vec_high_h(vec_high_d.begin(), high_end_d);
        if (!is_stable_split(vec_rnd_h, vec_low_h, vec_high_h)) {
            std::cerr << "copy_if: result is not a stable split of the input" << std::endl;
            all_ok = false;
        }
    }

    for (int i = 0; i < n_runs; ++i) {
        // stable_partition works in place, so each run gets a fresh copy
        // (made outside the timed region).
        thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
        EventTimer timer;
        timer.start();
        const thrust::device_vector<float>::iterator iter_split = 
            thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
        timer.stop();
        std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;

        // check result: [begin, iter_split) is the low part, the rest is high
        const thrust::host_vector<float> vec_low_h(vec_rnd_copy.begin(), iter_split);
        const thrust::host_vector<float> vec_high_h(iter_split, vec_rnd_copy.end());
        if (!is_stable_split(vec_rnd_h, vec_low_h, vec_high_h)) {
            std::cerr << "stable_partition: result is not a stable split of the input" << std::endl;
            all_ok = false;
        }
    }

    CHECK_CURAND_CALL(curandDestroyGenerator(gen));

    return all_ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

输出:

C:\rd\projects\cpp\test_cuda\Release>test_cuda.exe
copy_if: 40.2919ms
copy_if: 38.0157ms
copy_if: 38.5036ms
copy_if: 37.6751ms
copy_if: 38.1054ms
stable_partition: 59.5473ms
stable_partition: 61.4016ms
stable_partition: 59.1854ms
stable_partition: 61.3195ms
stable_partition: 59.1205ms

答案 1 :(得分:3)

为了回答我自己的问题,我终于找到了thrust::stable_partition,它比所有基于“copy_if”的替代方案都更高效、更优雅。它只是将所有不满足谓词的元素移动到数组的末尾,并返回第二个序列的起始位置。通过指针运算可以得到B的大小,但实际上已经不再需要它了:

// pred must return true for the elements that stay in A. stable_partition
// moves every element that fails pred behind `iter`, keeping the relative
// order inside both groups.
thrust::device_vector<float>::iterator iter = thrust::stable_partition(A.begin(), A.end(), pred);
// Build B from the tail of A (the elements that failed pred) ...
thrust::device_vector<float> B(iter, A.end());
// ... then drop that tail from A.
A.erase(iter, A.end());