Question

在CUDA中如何使用std::vector并不是显而易见的，所以我设计了自己的Vector类：

#ifndef VECTORHEADERDEF
#define VECTORHEADERDEF

#include <cmath>
#include <iostream>
#include <cassert>

// Minimal owning array wrapper: holds a buffer of T (mData) together with
// its element count (mSize). The member functions declared here are defined
// out of class further down in this header.
template <typename T>
class Vector
{
private:
   T* mData;   // data stored in vector
   int mSize;  // size of vector
public:
        Vector(const Vector& otherVector);  // Copy constructor
        Vector(int size);   // Constructor for a vector of `size` elements
        ~Vector();   // Destructor

        __host__ __device__ int GetSize() const; // get size of the vector

        T& operator[](int i);  // access element i (index is assert-checked)

        // Element-wise assignment between vectors of equal size.
        // This declaration is required: the operator is defined out of class
        // below, and a member cannot be defined without being declared here.
        Vector& operator=(const Vector& otherVector);

        // change element i (no bounds check is performed)
        __host__ __device__ void set(size_t i, T value) {
                mData[i] = value;
        }

        template <class S>    // output vector
        friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};


// Copy constructor
// Gives the new vector its own buffer of the same length as otherVector and
// duplicates every entry into it
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
   mSize = otherVector.GetSize();
   mData = new T [mSize];

   const T* src = otherVector.mData;
   T* dst = mData;
   for (int remaining = mSize; remaining > 0; --remaining)
   {
      *dst++ = *src++;
   }
}

// Constructor for vector of a given size
// Allocates memory for `size` entries and initialises each one to T(),
// which is zero for arithmetic element types.
// `size` must be positive; this is checked with assert, so the check is
// compiled out when NDEBUG is defined.
template <typename T>
Vector<T>::Vector(int size)
{
   assert(size > 0);
   mSize = size;
   mData = new T [mSize];
   for (int i=0; i<mSize; i++)
   {
      // T() instead of the double literal 0.0: no double-to-T conversion is
      // involved, so element types that are not assignable from double work too.
      mData[i] = T();
   }
}

// Destructor
// Releases the element buffer owned by this vector
template <typename T>
Vector<T>::~Vector()
{
   delete [] this->mData;
}

// Reports how many entries the vector holds
template <typename T>
__host__ __device__ int Vector<T>::GetSize() const
{
   const int count = mSize;
   return count;
}

// Subscript operator
// Indexing is zero-based; an out-of-range index trips an assert
template <typename T>
T& Vector<T>::operator[](int i)
{
        assert(i > -1);
        assert(i < mSize);
        T* const slot = mData + i;
        return *slot;
}

// Assignment operator
// Copies otherVector's entries over this vector's entries; both vectors must
// already have the same length (checked with assert)
template <typename T>
Vector<T>& Vector<T>::operator=(const Vector& otherVector)
{
   assert(mSize == otherVector.mSize);
   int idx = 0;
   while (idx < mSize)
   {
      mData[idx] = otherVector.mData[idx];
      ++idx;
   }
   return *this;
}

// Stream insertion operator
// Writes every entry in index order, each followed by three spaces
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
   const int count = v.mSize;
   int idx = 0;
   while (idx < count) {
      output << v[idx] << "   ";
      ++idx;
   }
   return output;
}

我的main函数如下（内核仅为测试目的而设计）——我只是将一个向量传递给设备，修改它，然后再把它传回：

#include <iostream>

#include "Vector.hpp"


// Test kernel: writes 100 into element 0 and 11 into element 2 of *d_num.
// d_num is dereferenced in device code, so it must point to a Vector<int>
// object that is readable from the device and whose element buffer is
// writable from the device.
__global__ void alpha(Vector<int>* d_num)
{
        // No per-thread index is computed: every launched thread writes the
        // same values to the same two elements.
        d_num->set(0,100);
        d_num->set(2,11);
}


// Driver for the experiment: fills a 10-element Vector on the host, copies
// its elements to the device, launches the alpha kernel and tries to copy
// the result back before printing it.
int main()
{
        Vector<int> num(10);

        for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9

        std::cout << "Size of vector: " << num.GetSize() << "\n";
        std::cout << num << "\n"; // print vector

        // NOTE(review): declared as a pointer to a Vector<int> object, but the
        // allocation and copy below treat it as a buffer of GetSize() ints.
        Vector<int>* d_num;

        // allocate global memory on the device (GetSize() * sizeof(int) bytes)
        cudaMalloc((void **) &d_num, num.GetSize()*sizeof(int));

        // copy the element data from host memory to the device memory
        cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);


        // launch the kernel: 1 block of 100 threads
        alpha<<<1,100>>>(d_num);


        // copy the modified array back to the host, overwriting the contents of num
        // NOTE(review): the destination argument is the Vector object `num`
        // itself, not a pointer to its first element such as &num[0].
        cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);

        std::cout << num << "\n";


        // free GPU memory allocation and exit
        cudaFree(d_num);

        return 0;
}

我遇到的问题是cudaMemcpyDeviceToHost。从输出中可以看出，它并没有真正将设备向量复制到num向量。

我该如何处理？（请讲得具体一些，我刚接触CUDA。）

Answer 1

这将创建一个指向向量num的第一个元素的有效指针：

    cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);
                      ^^^^^^^

这不会：

    cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
               ^^^

您的Vector对象的名称不是指向其第一个数据元素的指针。相反，您应该以与您编写的第一个类似的方式编写该行，如下所示：

    cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);

然而，这本身并不是一个解决方法。请注意d_num不是Vector，但已经是指针，因此我们可以直接在这些操作中使用它。虽然使用&(d_num[0])没有错，但没有必要这样做。

由于d_num不是Vector（按照您分配它的方式，它只是一个指向一组int数据的裸指针），因此您在内核中对Vector方法的使用也是有问题的。如果要在内核中使用Vector方法，则需要将实际的Vector对象传递给内核，而不仅仅是数据。由于传递对象需要在对象内部处理设备数据（主机上可访问的数据在设备上无法访问，反之亦然），这就需要对Vector类进行大幅重写。我做了一次有限的尝试，展示了一种可能的前进方向。基本方法（即一种可能的方法）如下：

该对象将包含指向数据主机副本和数据设备副本的指针。
在对象实例化时，我们将同时分配两者，并在初始时将我们的“引用”指针设置为指向主机副本。
在设备上使用之前，我们必须将主机数据复制到设备数据，to_device()方法即用于此目的。该方法还会把我们的“引用”指针（mData）切换为指向Vector数据的设备端副本。
除了在对象“内部”将主机数据复制到设备数据之外，我们还必须使对象本身在设备上可用。为此，我们将对象本身复制到一个由指针（d_num）指向的设备端副本中。
然后我们可以在设备上以通常的方式使用该对象，用于那些具有__device__装饰的方法。
内核执行完成后，我们必须更新数据的主机副本，并将我们的“引用”指针切换回主机数据。to_host()方法即用于此目的。
此后，可以在主机代码中再次使用该对象，反映内核中发生的任何数据更改。

这是一个有效的例子：

$ cat t101.cu
#include <iostream>

#include <cmath>
#include <iostream>
#include <cassert>

// Array wrapper that keeps a host buffer (hData) and a device buffer (dData)
// of the same length; mData is the pointer the element accessors go through.
template <typename T>
class Vector
{
private:
   T* mData;   // buffer currently used by operator[], set and operator=
   T* hData;   // host-side storage
   T* dData;   // device-side storage

   int mSize;  // size of vector
public:
        Vector(const Vector& otherVector);  // Copy constructor
        Vector(int size);   // Constructor for a vector of `size` elements
        ~Vector();   // Destructor

        __host__ __device__ int GetSize() const; // get size of the vector
        __host__ __device__ T& operator[](int i);  // access element i

        // overwrite element i
        __host__ __device__ void set(size_t i, T value) {
                mData[i] = value;
        }

        __host__ __device__ Vector<T>& operator=(const Vector<T>& otherVector);
        void to_device();   // copy host data to the device buffer and make mData refer to it
        void to_host();     // copy device data to the host buffer and make mData refer to it
        template <class S>    // output vector
        friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};


// Copy constructor
// Allocates a host buffer and a device buffer of otherVector's size and copies
// otherVector's host-side entries into the new host buffer. The new vector
// starts out referring to its host buffer. Device-side changes that
// otherVector has not yet brought back with to_host() are not included.
// If the device allocation fails, the error is reported on std::cerr and
// dData is left null so that the destructor does not free an invalid pointer.
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
   mSize = otherVector.GetSize();
   hData = new T [mSize];
   dData = 0;   // stays null unless cudaMalloc succeeds
   cudaError_t err = cudaMalloc(&dData, mSize*sizeof(T));
   if (err != cudaSuccess)
   {
      std::cerr << "Vector: cudaMalloc failed: " << cudaGetErrorString(err) << "\n";
      dData = 0;
   }
   mData = hData;
   for (int i=0; i<mSize; i++)
   {
      // Read from hData rather than mData: after to_device() the other
      // vector's mData is its device pointer, which must not be dereferenced
      // in host code.
      mData[i] = otherVector.hData[i];
   }
}

// Constructor for vector of a given size
// Allocates a host buffer and a device buffer of `size` entries and
// initialises the host entries to T() (zero for arithmetic types). The device
// buffer is only allocated here; it receives data when to_device() is called.
// `size` must be positive (checked with assert).
// If the device allocation fails, the error is reported on std::cerr and
// dData is left null so that the destructor does not free an invalid pointer.
template <typename T>
Vector<T>::Vector(int size)
{
   assert(size > 0);
   mSize = size;
   hData = new T [mSize];
   dData = 0;   // stays null unless cudaMalloc succeeds
   cudaError_t err = cudaMalloc(&dData, mSize*sizeof(T));
   if (err != cudaSuccess)
   {
      std::cerr << "Vector: cudaMalloc failed: " << cudaGetErrorString(err) << "\n";
      dData = 0;
   }
   mData = hData;
   for (int i=0; i<mSize; i++)
   {
      // T() instead of the double literal 0.0: no double-to-T conversion needed
      mData[i] = T();
   }
}

// Destructor
// Returns the device buffer (when the pointer is non-null) and the host buffer
template <typename T>
Vector<T>::~Vector()
{
   if (dData != 0)
   {
      cudaFree(dData);
   }
   delete [] hData;
}

// Reports how many entries the vector holds (host and device callable)
template <typename T>
__host__ __device__
int Vector<T>::GetSize() const
{
   const int count = mSize;
   return count;
}

// Subscript operator (host and device callable)
// Indexing is zero-based; an out-of-range index trips an assert.
// The element is taken from whichever buffer mData currently refers to.
template <typename T>
__host__ __device__
T& Vector<T>::operator[](int i)
{
        assert(i > -1);
        assert(i < mSize);
        return *(mData + i);
}

// Assignment operator (host and device callable)
// Copies the entries reachable through otherVector.mData over the entries
// reachable through this vector's mData; both vectors must already have the
// same length (checked with assert)
template <typename T>
__host__ __device__
Vector<T>& Vector<T>::operator=(const Vector<T>& otherVector)
{
   assert(mSize == otherVector.mSize);
   const T* src = otherVector.mData;
   T* dst = mData;
   for (int left = mSize; left > 0; --left)
   {
      *dst++ = *src++;
   }
   return *this;
}

// Stream insertion operator
// not callable on the device!
// Writes every entry in index order, each followed by three spaces
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
   int idx = 0;
   while (idx < v.mSize) {
      output << v[idx] << "   ";
      ++idx;
   }
   return output;
}

// Copies the host buffer into the device buffer and, on success, makes mData
// refer to the device buffer so that the accessors operate on device data.
// If the copy fails, the error is reported on std::cerr and mData is left
// unchanged, so it never refers to a device buffer that was not filled.
template <typename T>
void Vector<T>::to_device(){
  cudaError_t err = cudaMemcpy(dData, hData, mSize*sizeof(T), cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    std::cerr << "Vector::to_device: cudaMemcpy failed: " << cudaGetErrorString(err) << "\n";
    return;
  }
  mData = dData;
}

// Copies the device buffer back into the host buffer and makes mData refer to
// the host buffer again, so that host code can use the accessors.
// A failure of the copy is reported on std::cerr; this blocking copy is also a
// point where an error from an earlier kernel can surface.
template <typename T>
void Vector<T>::to_host(){
  cudaError_t err = cudaMemcpy(hData, dData, mSize*sizeof(T), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    std::cerr << "Vector::to_host: cudaMemcpy failed: " << cudaGetErrorString(err) << "\n";
  }
  // Switch back even when the copy failed: host code must never be left
  // dereferencing the device pointer. In that case hData holds the values it
  // had before the failed copy.
  mData = hData;
}

// Test kernel: stores 100 and 11 into elements 0 and 2 through set(), then
// 50 into element 1 through operator[].
__global__ void alpha(Vector<int> *d_num)
{
        Vector<int>& vec = *d_num;

        vec.set(0,100);
        vec.set(2,11);
        vec[1] = 50;
}


// Demo driver: fills a 10-element Vector on the host, lets the alpha kernel
// modify it on the device, and prints it before and after.
// Returns 0 on success and 1 if a CUDA call or the kernel reports an error.
int main()
{
        Vector<int> num(10);

        for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9

        std::cout << "Size of vector: " << num.GetSize() << "\n";
        std::cout << num << "\n"; // print vector

        // device-side storage for a copy of the Vector object itself
        // (the element data lives in the buffers owned by num)
        Vector<int> *d_num = 0;
        cudaError_t err = cudaMalloc(&d_num, sizeof(Vector<int>));
        if (err != cudaSuccess) {
                std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << "\n";
                return 1;
        }

        // to_device() has to run before the object is copied: it makes
        // num.mData refer to the device buffer, and that pointer value is
        // what the device-side copy of the object needs to contain.
        num.to_device();
        err = cudaMemcpy(d_num, &(num), sizeof(Vector<int>), cudaMemcpyHostToDevice);
        if (err == cudaSuccess) {
                // launch the kernel
                alpha<<<1,1>>>(d_num);
                err = cudaGetLastError();               // launch-time errors
                if (err == cudaSuccess)
                        err = cudaDeviceSynchronize();  // errors raised while the kernel ran
        }

        // copy the modified array back to the host, overwriting the host
        // contents of num; done on every path so that num refers to host
        // data again before it is printed
        num.to_host();

        // free GPU memory allocation
        cudaFree(d_num);

        if (err != cudaSuccess) {
                std::cerr << "CUDA error: " << cudaGetErrorString(err) << "\n";
                return 1;
        }

        std::cout << num << "\n";

        return 0;
}
$ nvcc -arch=sm_61 -o t101 t101.cu
$ cuda-memcheck ./t101
========= CUDA-MEMCHECK
Size of vector: 10
0   1   2   3   4   5   6   7   8   9
100   50   11   3   4   5   6   7   8   9
========= ERROR SUMMARY: 0 errors
$

注意：

根据我的测试，您发布的代码存在各种编译错误，因此我必须对您的Vector类进行其他更改才能将其编译。
按值将对象传递给内核将调用复制构造函数，然后调用析构函数，这会使事情变得更加困难，因此我选择通过指针传递对象（这是你原来的方式），以避免这种情况。
您的内核调用启动了100个线程。由于它们做的事情完全相同，并且没有任何读取操作，这并没有什么特别的错误，但我已将其改为单个线程。它仍然可以展示同样的功能。

Answer 2

您遇到麻烦的不仅仅是cudaMemcpyDeviceToHost这一部分。

Vector<int> num(10);
Vector<int>* d_num;   // typed as a pointer to a Vector<int> object
cudaMalloc(&d_num, num.GetSize()*sizeof(int));   // but sized as GetSize() ints

这将在CUDA全局内存上分配40个字节（假设sizeof(int)为4），并由类型为Vector<int>*的d_num指向。我想您并不指望Vector<int>对象本身的大小是40个字节。

让我们尝试另一种方式。

cudaMalloc(&d_num, sizeof(Vector<int>));   // d_num now holds a device address
cudaMalloc(&d_num->mData, num.GetSize()*sizeof(int)); // assume mData is a public attribute; d_num->mData is evaluated in host code

不幸的是，第二行将引发segmentation fault，因为您正在从主机代码中访问设备内存（d_num->mData）。

因此，Vector类的实现有很多谬误。如果您计划使用固定大小的数组，只需将d_num声明为指针即可。

int* d_num;   // plain device pointer to the element data
cudaMalloc(&d_num, num.GetSize()*sizeof(int));   // room for GetSize() ints on the device
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);   // host -> device
// .. some kernel operations
cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);   // device -> host
// NOTE(review): the fragment never releases d_num; pair the cudaMalloc with cudaFree(d_num) once the buffer is no longer needed

Answer 3

Thrust是为CUDA编写的库，它提供了向量容器。 http://docs.nvidia.com/cuda/thrust/ 也许它已经具备您需要的所有功能；如果没有必要，何必重新发明轮子呢？

处理向量 - cudaMemcpyDeviceToHost

3 个答案: