如何在 CUDA 中使用 std::vector 并不显而易见,所以我设计了自己的 Vector 类:


#include <cmath>
#include <iostream>
#include <cassert>

/**
 * Fixed-size owning array of T, written as a stand-in for std::vector.
 *
 * The element storage is allocated with new[] in the constructors and
 * released with delete[] in the destructor. GetSize() and set() are callable
 * from both host and device code; operator[] is host-only in this version.
 */
template <typename T>
class Vector
{
private:
   T* mData;   // data stored in vector
   int mSize;  // size of vector

public:
   Vector(const Vector& otherVector);  // Copy constructor
   Vector(int size);                   // Constructor for a given size
   ~Vector();                          // Destructor

   __host__ __device__ int GetSize() const; // get size of the vector

   T& operator[](int i);  // see element

   // element-wise assignment; both vectors must have the same size
   Vector& operator=(const Vector& otherVector);

   // change element i (no bounds check is performed here)
   __host__ __device__ void set(size_t i, T value) {
      mData[i] = value;
   }

   template <class S>    // output vector
   friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};

// Copy constructor
// Allocates memory for the new vector and copies the entries of the other
// vector into it, so the two vectors never share storage.
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
   mSize = otherVector.GetSize();
   mData = new T [mSize];
   for (int i=0; i<mSize; i++)
   {
      mData[i] = otherVector.mData[i];
   }
}

// Constructor for vector of a given size
// Allocates memory and value-initialises every entry (zero for arithmetic T).
// The size must be strictly positive; this is enforced with assert.
template <typename T>
Vector<T>::Vector(int size)
{
   assert(size > 0);
   mSize = size;
   // The trailing () value-initialises the elements, which avoids assigning
   // the double literal 0.0 to an arbitrary element type T.
   mData = new T [mSize]();
}

// Destructor: releases the array allocated by the constructors
template <typename T>
Vector<T>::~Vector()
{
   delete[] mData;
}

// Method to get the size of a vector
// Returns the number of elements; callable from host and device code.
template <typename T>
__host__ __device__ int Vector<T>::GetSize() const
{
   return mSize;
}

// Overloading square brackets
// Uses zero-based indexing and asserts that the index lies in [0, mSize).
// Returns a reference, so the element can be read or written through it.
template <typename T>
T& Vector<T>::operator[](int i)
{
   assert(i >= 0);
   assert(i < mSize);
   return mData[i];
}

// Overloading the assignment operator
// Copies the entries of otherVector element by element. No reallocation is
// done, so both vectors must already have the same size (asserted).
template <typename T>
Vector<T>& Vector<T>::operator=(const Vector& otherVector)
{
   assert(mSize == otherVector.mSize);
   for (int i=0; i<mSize; i++)
   {
      mData[i] = otherVector.mData[i];
   }
   return *this;
}

// Overloading the insertion << operator
// Writes every element followed by three spaces and returns the stream so
// that insertions can be chained.
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
   for (int i=0; i<v.mSize; i++) {
      output << v[i] << "   ";
   }
   return output;
}

我的 main 函数如下——我只是将一个向量传递给设备,修改它并将其传回(内核仅用于测试目的):

#include <iostream>

#include "Vector.hpp"

// Test kernel from the question.
// NOTE(review): only the thread-index computation survives in this listing;
// the statements that modified the vector through d_num are missing and are
// not reconstructed here.
__global__ void alpha(Vector<int>* d_num)
{
        // flat global thread index for a 1D launch
        int myId = threadIdx.x + blockDim.x * blockIdx.x;
}


// Test driver: fills a 10-element vector on the host, copies its elements to
// the device, runs the test kernel, and copies the elements back.
int main()
{
        Vector<int> num(10);

        for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9

        std::cout << "Size of vector: " << num.GetSize() << "\n";
        std::cout << num << "\n"; // print vector

        // NOTE: although declared as Vector<int>*, the allocation below holds
        // only num.GetSize() ints, so d_num does not point to a Vector object
        // on the device. Calling Vector methods through it in the kernel is
        // therefore invalid.
        Vector<int>* d_num;

        // allocate global memory on the device
        cudaMalloc((void **) &d_num, num.GetSize()*sizeof(int));

        // copy data from host memory to the device memory
        cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);

        // launch the kernel
        // NOTE(review): the launch statement is missing from the listing; a
        // 100-thread launch is assumed, and the grid/block split is a guess.
        alpha<<<1, 100>>>(d_num);

        // copy the modified array back to the host, overwriting the contents of num.
        // The destination must be a pointer to the host storage and the source
        // the device pointer. The question originally passed `num` and
        // `&d_num[0]` here, which does not compile.
        cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);

        std::cout << num << "\n";

        // free GPU memory allocation and exit
        cudaFree(d_num);

        return 0;
}


我该如何处理?(请讲得明确一些,我是 CUDA 新手。)

    // Host-to-device copy: destination is the device pointer d_num, source is
    // the address of the first host element.
    cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);


    // Device-to-host copy with the Vector object `num` itself passed as the
    // destination argument instead of a pointer to its storage.
    cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);


    // Device-to-host copy with a pointer destination: &num[0] on the host,
    // d_num as the device source.
    cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);


由于 d_num 并不是 Vector(按照您分配它的方式,它只是一个指向一组 int 的裸指针),因此您在内核中对 Vector 方法的调用也是错误的。如果要在内核中使用 Vector 方法,就需要把真正的 Vector 对象传给内核,而不仅仅是数据。传递对象要求在对象内部处理设备数据(主机上可访问的数据在设备上无法访问,反之亦然),因此需要对 Vector 类进行大幅重写。我做了一个有限的尝试,展示一种可能的改进方向。基本方法(即一种可能的方法)如下:

  1. 该对象将包含指向数据主机副本和数据设备副本的指针。
  2. 在对象实例化时,我们会同时分配这两份存储,并将我们的"引用"指针初始设置为指向主机副本。
  3. 在设备上使用之前,我们必须将主机数据复制到设备数据,to_device() 方法即用于此目的。该方法还会切换我们的"引用"指针(mData),使其指向 Vector 数据的设备端副本。
  4. 除了在对象"内部"将主机数据复制到设备数据之外,我们还必须使对象本身在设备上可用。为此,我们将对象本身复制到由指针 d_num 指向的设备端副本中。
  5. 然后我们可以在设备上以通常的方式使用该对象,用于那些具有__device__装饰的方法。
  6. 内核执行完成后,我们必须更新数据的主机副本,并将我们的"引用"指针切换回主机数据。to_host() 方法即用于此目的。
  7. 此后,可以在主机代码中再次使用该对象,反映内核中发生的任何数据更改。
  8. 这是一个有效的例子:

    $ cat t101.cu
    #include <iostream>
    #include <cmath>
    #include <iostream>
    #include <cassert>
    /**
     * Fixed-size array of T that keeps one host copy and one device copy of
     * its elements.
     *
     * hData is the host allocation and dData the device allocation; mData is
     * the "reference" pointer used by the accessors. It starts out equal to
     * hData, to_device() switches it to dData and to_host() switches it back.
     */
    template <typename T>
    class Vector
    {
    private:
       T* mData, *hData, *dData;   // data stored in vector
       int mSize;  // size of vector

    public:
       Vector(const Vector& otherVector);  // Copy constructor
       Vector(int size);   // Constructor for a given size
       ~Vector();   // Destructor
       __host__ __device__ int GetSize() const; // get size of the vector
       __host__ __device__ T& operator[](int i);  // see element
       // change element i (no bounds check is performed here)
       __host__ __device__ void set(size_t i, T value) {
          mData[i] = value;
       }
       __host__ __device__ Vector<T>& operator=(const Vector<T>& otherVector);
       void to_device();   // copy host data to the device and reference the device copy
       void to_host();     // copy device data to the host and reference the host copy
       template <class S>    // output vector
       friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
    };
    // Copy constructor
    // Allocates host and device storage for the new vector and copies the
    // other vector's host entries into it. The new vector references its host
    // copy.
    template <typename T>
    Vector<T>::Vector(const Vector& otherVector)
    {
       mSize = otherVector.GetSize();
       hData = new T [mSize];
       cudaMalloc(&dData, mSize*sizeof(T));
       mData = hData;
       // Read from otherVector.hData rather than otherVector.mData: after
       // to_device() the latter is a device pointer that cannot be
       // dereferenced on the host. When otherVector references its host copy
       // the two pointers are equal, so the result is unchanged in that case.
       for (int i=0; i<mSize; i++)
       {
          hData[i] = otherVector.hData[i];
       }
    }
    // Constructor for vector of a given size
    // Allocates host and device storage and value-initialises the host
    // entries (zero for arithmetic T). The device copy is left uninitialised
    // until to_device() is called. The size must be strictly positive
    // (asserted).
    template <typename T>
    Vector<T>::Vector(int size)
    {
       assert(size > 0);
       mSize = size;
       // The trailing () value-initialises the elements, which avoids
       // assigning the double literal 0.0 to an arbitrary element type T.
       hData = new T [mSize]();
       cudaMalloc(&dData, mSize*sizeof(T));
       mData = hData;
    }
    // Destructor: releases the host array and, if present, the device array
    template <typename T>
    Vector<T>::~Vector()
    {
       delete[] hData;
       if (dData) cudaFree(dData);
    }
    // Method to get the size of a vector
    // Returns the number of elements; callable from host and device code.
    template <typename T>
    __host__ __device__
    int Vector<T>::GetSize() const
    {
       return mSize;
    }
    // Overloading square brackets
    // Uses zero-based indexing and asserts that the index lies in [0, mSize).
    // The element is accessed through mData, i.e. through whichever copy
    // (host or device) the vector currently references.
    template <typename T>
    __host__ __device__
    T& Vector<T>::operator[](int i)
    {
       assert(i >= 0);
       assert(i < mSize);
       return mData[i];
    }
    // Overloading the assignment operator
    // Copies the entries of otherVector element by element through the mData
    // pointers of both vectors. No reallocation is done, so both vectors must
    // already have the same size (asserted).
    template <typename T>
    __host__ __device__
    Vector<T>& Vector<T>::operator=(const Vector<T>& otherVector)
    {
       assert(mSize == otherVector.mSize);
       for (int i=0; i<mSize; i++)
       {
          mData[i] = otherVector.mData[i];
       }
       return *this;
    }
    // Overloading the insertion << operator
    // not callable on the device!
    // Writes every element followed by three spaces and returns the stream so
    // that insertions can be chained.
    template <typename T>
    std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
       for (int i=0; i<v.mSize; i++) {
          output << v[i] << "   ";
       }
       return output;
    }
    // Copies the host data to the device copy and switches mData to the
    // device copy, so that subsequent element access goes to device memory.
    template <typename T>
    void Vector<T>::to_device(){
      cudaMemcpy(dData, hData, mSize*sizeof(T), cudaMemcpyHostToDevice);
      mData = dData;
    }
    // Copies the device data back to the host copy and switches mData to the
    // host copy, so that the vector can be used from host code again.
    template <typename T>
    void Vector<T>::to_host(){
      cudaMemcpy(hData, dData, mSize*sizeof(T), cudaMemcpyDeviceToHost);
      mData = hData;
    }
    // Test kernel: modifies a few elements through a device-side Vector object.
    // d_num must point to a device copy of a Vector that has been switched to
    // its device data with to_device(). Every thread performs the same writes,
    // so the kernel is meant to be launched with a single thread.
    __global__ void alpha(Vector<int> *d_num)
    {
            // NOTE(review): the writes to elements 0 and 2 are missing from
            // the listing and are reconstructed from the program output shown
            // below (100 50 11 ...); whether set() or operator[] was used for
            // them is a guess.
            d_num->set(0, 100);
            d_num->set(2, 11);
            (*d_num)[1] = 50;
    }
    // Test driver: fills a vector on the host, makes both its data and the
    // object itself available on the device, runs the kernel, and brings the
    // data back.
    int main()
    {
            Vector<int> num(10);
            for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9
            std::cout << "Size of vector: " << num.GetSize() << "\n";
            std::cout << num << "\n"; // print vector
            Vector<int> *d_num;
            cudaMalloc(&d_num, sizeof(Vector<int>));
            // Copy the element data to the device and switch mData to the
            // device copy. This has to happen before the object is copied so
            // that the device-side object carries the device data pointer.
            num.to_device();
            cudaMemcpy(d_num, &(num), sizeof(Vector<int>), cudaMemcpyHostToDevice);
            // launch the kernel (a single thread is sufficient for this test)
            alpha<<<1,1>>>(d_num);
            // copy the modified data back to the host copy and switch mData
            // back to the host data
            num.to_host();
            std::cout << num << "\n";
            // free GPU memory allocation and exit
            cudaFree(d_num);
            return 0;
    }
    $ nvcc -arch=sm_61 -o t101 t101.cu
    $ cuda-memcheck ./t101
    ========= CUDA-MEMCHECK
    Size of vector: 10
    0   1   2   3   4   5   6   7   8   9
    100   50   11   3   4   5   6   7   8   9
    ========= ERROR SUMMARY: 0 errors


    1. 根据我的测试,您发布的代码存在多处编译错误,因此我不得不对您的 Vector 类做一些额外的修改才能使其通过编译。

    2. 按值将对象传递给内核会调用复制构造函数,随后还会调用析构函数,这会使事情变得更加复杂,因此我选择通过指针传递对象(这也是您原来的做法),以避免这种情况。

    3. 您的内核调用启动了 100 个线程。由于它们做的事情完全相同,并且没有任何读取操作,这本身并没有什么错,但我把它改成了单个线程。它仍然展示了相同的功能。

// d_num is declared as a pointer to Vector<int>, but the allocation below is
// sized as num.GetSize() ints rather than as a Vector<int> object.
Vector<int> num(10);
Vector<int>* d_num;
cudaMalloc(&d_num, num.GetSize()*sizeof(int));



// Allocates device memory for one Vector<int> object.
cudaMalloc(&d_num, sizeof(Vector<int>));
// Evaluates d_num->mData in host code, i.e. goes through the device pointer
// d_num on the host to reach its mData member.
cudaMalloc(&d_num->mData, num.GetSize()*sizeof(int)); // assume mData is a public attribute

不幸的是,第二行会引发段错误(segmentation fault),因为您在主机代码中通过 d_num->mData 访问了设备内存。


// Raw-pointer variant: the device buffer is a plain int array with one
// element per entry of num.
int* d_num;
cudaMalloc(&d_num, num.GetSize()*sizeof(int));
// host -> device: source is the address of the first host element
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);
// .. some kernel operations
// device -> host: writes the results back into the host storage of num
cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);

Thrust 是为 CUDA 编写的库,它提供了向量容器:http://docs.nvidia.com/cuda/thrust/ 。也许它已经具备您需要的所有功能;如果没有必要,何必重新发明轮子呢?