如何在CUDA中使用std::vector并不明显，所以我设计了自己的Vector类：
#ifndef VECTORHEADERDEF
#define VECTORHEADERDEF
#include <cmath>
#include <iostream>
#include <cassert>
template <typename T>
class Vector
{
private:
    T* mData; // raw heap storage owned by this vector
    int mSize; // number of elements
public:
    Vector(const Vector& otherVector); // copy constructor (deep copy)
    Vector(int size); // construct a zero-initialised vector of `size` elements
    ~Vector(); // destructor: releases mData
    // Fix: this declaration was missing although an out-of-class definition
    // of operator= exists below; without it the file does not compile.
    Vector<T>& operator=(const Vector& otherVector);
    __host__ __device__ int GetSize() const; // get size of the vector
    T& operator[](int i); // bounds-checked element access (host only)
    // change element i (no bounds check; callable from host and device)
    __host__ __device__ void set(size_t i, T value) {
        mData[i] = value;
    }
    template <class S> // stream all elements, space-separated (host only)
    friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};
// Copy constructor: deep-copies the contents of the source vector into
// freshly allocated storage of the same size.
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
    mSize = otherVector.GetSize();
    mData = new T[mSize];
    for (int idx = 0; idx < mSize; ++idx)
        mData[idx] = otherVector.mData[idx];
}
// Constructor for a vector of a given (strictly positive) size.
// Allocates storage and value-initialises every entry.
template <typename T>
Vector<T>::Vector(int size)
{
    assert(size > 0); // zero and negative sizes are rejected
    mSize = size;
    mData = new T[mSize];
    for (int i = 0; i < mSize; i++)
    {
        // Fix: T() instead of the double literal 0.0 — valid for any
        // default-constructible T, not just arithmetic types, and avoids
        // implicit double->T conversion warnings.
        mData[i] = T();
    }
}
// Overridden destructor to correctly free memory
// Releases the heap array allocated by the constructors.
template <typename T>
Vector<T>::~Vector()
{
delete[] mData;
}
// Method to get the size of a vector
// Callable from both host and device code (reads only mSize).
template <typename T>
__host__ __device__ int Vector<T>::GetSize() const
{
return mSize;
}
// Bounds-checked element access using zero-based indexing.
// Aborts via assert when the index falls outside [0, mSize).
template <typename T>
T& Vector<T>::operator[](int i)
{
    assert(i >= 0 && i < mSize); // same condition as i > -1 && i < mSize
    return mData[i];
}
// Copy assignment: element-wise copy between two vectors that must already
// have identical sizes (asserted — no resizing is performed).
template <typename T>
Vector<T>& Vector<T>::operator=(const Vector& otherVector)
{
    assert(mSize == otherVector.mSize);
    int idx = 0;
    while (idx < mSize)
    {
        mData[idx] = otherVector.mData[idx];
        ++idx;
    }
    return *this;
}
// Stream insertion: writes every element followed by a single space.
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
    const int count = v.mSize;
    for (int idx = 0; idx != count; ++idx)
        output << v[idx] << " ";
    return output;
}
我的 main 函数——我只是把一个向量传到设备上、修改它并把它传回来——如下（内核仅用于测试目的）：
#include <iostream>
#include "Vector.hpp"
// Test kernel: writes fixed values into elements 0 and 2 of the vector.
// NOTE(review): every launched thread performs the same two writes, so the
// per-thread index that was computed here was never used and has been
// removed (a single-thread launch would suffice).
__global__ void alpha(Vector<int>* d_num)
{
    d_num->set(0, 100);
    d_num->set(2, 11);
}
int main()
{
// NOTE(review): this is the questioner's broken code, kept verbatim; the
// answers below explain the defects annotated here.
Vector<int> num(10);
for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9
std::cout << "Size of vector: " << num.GetSize() << "\n";
std::cout << num << "\n"; // print vector
Vector<int>* d_num;
// allocate global memory on the device
// NOTE(review): allocates 10*sizeof(int) bytes, yet d_num is typed
// Vector<int>* — the kernel's d_num->set() therefore dereferences a bogus
// mData pointer (see answer 1 below).
cudaMalloc((void **) &d_num, num.GetSize()*sizeof(int));
// copy data from host memory to the device memory
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);
// launch the kernel
alpha<<<1,100>>>(d_num);
// copy the modified array back to the host, overwriting the contents of h_arr
// NOTE(review): `num` is a Vector<int> object, not a pointer to its data;
// the destination should be &num[0] (see answer 0 below).
cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
std::cout << num << "\n";
// free GPU memory allocation and exit
cudaFree(d_num);
return 0;
}
我遇到的问题是cudaMemcpyDeviceToHost。从输出中可以看出,它并没有真正将设备向量复制到num向量。
我该如何处理? (请明确指出,我对CUDA很新。)
答案 0 :(得分:1)
这将创建一个指向向量num
的第一个元素的有效指针:
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);
^^^^^^^
这不会:
cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
^^^
您的Vector
对象的名称不是指向其第一个数据元素的指针。相反,您应该以与您编写的第一个类似的方式编写该行,如下所示:
cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
然而,这本身并不是一个解决方法。请注意d_num
不是Vector
,但已经是指针,因此我们可以直接在这些操作中使用它。虽然使用&(d_num[0])
没有错,但没有必要这样做。
由于d_num
不是Vector
(因为您已经分配了它 - 它是指向一组int
数量的裸指针),因此您对Vector
方法的使用在内核中也被打破了。如果要在内核中使用Vector
方法,则需要将实际的Vector
对象传递给它,而不仅仅是数据。由于传递对象将需要对象内的设备数据处理(主机上可访问的数据无法在设备上访问,反之亦然),因此它是对Vector
类的广泛重写。我已经做了有限的尝试,展示了一种可能的前进方向。基本方法(即一种可能的方法)如下:
1. 在构造函数中同时分配数据的主机端副本和设备端副本。
2. 在内核使用数据之前调用 to_device() 方法，把主机端数据复制到设备端；该方法同时把我们的“引用”指针（mData）切换为指向 Vector 数据的设备端副本。
3. 把 Vector 对象本身复制到设备上（由 d_num 指向）。
4. 在内核中通过带 __device__ 修饰的方法访问数据。
5. 内核结束后调用 to_host() 方法，把数据复制回主机端，并把 mData 切换回主机端副本。
这是一个有效的例子：
$ cat t101.cu
#include <iostream>
#include <cmath>
#include <iostream>
#include <cassert>
template <typename T>
class Vector
{
private:
// mData aliases either hData (the host copy) or dData (the device copy);
// to_device()/to_host() below switch which copy it refers to.
T* mData, *hData, *dData; // data stored in vector
int mSize; // size of vector
public:
Vector(const Vector& otherVector); // copy constructor (deep copy)
Vector(int size); // size constructor (zero-initialised entries)
~Vector(); // destructor: frees both the host and the device storage
__host__ __device__ int GetSize() const; // get size of the vector
__host__ __device__ T& operator[](int i); // bounds-checked element access
// change element i (no bounds check; usable from host and device,
// depending on which copy mData currently refers to)
__host__ __device__ void set(size_t i, T value) {
mData[i] = value;
};
__host__ __device__ Vector<T>& operator=(const Vector<T>& otherVector);
void to_device(); // copy host data to device; point mData at the device copy
void to_host(); // copy device data back; point mData at the host copy
template <class S> // output vector (host only)
friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};
// Overridden copy constructor
// Allocates host and device memory for the new vector and copies the
// entries of the other vector into the host copy.
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
mSize = otherVector.GetSize();
hData = new T [mSize];
cudaMalloc(&dData, mSize*sizeof(T));
mData = hData; // a fresh vector starts out referring to its host copy
for (int i=0; i<mSize; i++)
{
// NOTE(review): reads otherVector.mData, which points at device memory
// whenever the source vector is in its to_device() state — only copy from
// vectors whose data is currently resident on the host. Confirm callers.
mData[i] = otherVector.mData[i];
}
}
// Constructor for a vector of a given (strictly positive) size.
// Allocates both the host and the device copy of the storage and
// value-initialises the host copy; mData starts out as the host alias.
template <typename T>
Vector<T>::Vector(int size)
{
    assert(size > 0); // zero and negative sizes are rejected
    mSize = size;
    hData = new T [mSize];
    cudaMalloc(&dData, mSize*sizeof(T));
    mData = hData;
    for (int i=0; i<mSize; i++)
    {
        // Fix: T() instead of the double literal 0.0 — valid for any
        // default-constructible T, not just arithmetic types.
        mData[i] = T();
    }
}
// Overridden destructor to correctly free memory
// Releases the host array and, if allocated, the device array.
template <typename T>
Vector<T>::~Vector()
{
delete[] hData;
if (dData) cudaFree(dData);
}
// Method to get the size of a vector
// Callable from both host and device code (reads only mSize).
template <typename T>
__host__ __device__
int Vector<T>::GetSize() const
{
return mSize;
}
// Bounds-checked, zero-based element access, usable on host and device
// (the device-side assert traps on an out-of-range index).
template <typename T>
__host__ __device__
T& Vector<T>::operator[](int i)
{
    assert(i >= 0 && i < mSize); // same condition as i > -1 && i < mSize
    return mData[i];
}
// Copy assignment between two equally sized vectors: element-wise copy via
// whichever storage mData currently refers to. No resizing is performed.
template <typename T>
__host__ __device__
Vector<T>& Vector<T>::operator=(const Vector<T>& otherVector)
{
    assert(mSize == otherVector.mSize);
    int idx = 0;
    while (idx < mSize)
    {
        mData[idx] = otherVector.mData[idx];
        ++idx;
    }
    return *this;
}
// Overloading the insertion << operator
// not callable on the device!
// NOTE(review): dereferences v.mData via v[i]; call to_host() first if the
// vector is currently in its device state, otherwise this reads through a
// device pointer from host code.
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
for (int i=0; i<v.mSize; i++) {
output << v[i] << " ";
}
return output;
}
// Copy the host data into the device allocation and switch mData so that
// subsequent element accesses (e.g. from a kernel) use the device copy.
template <typename T>
void Vector<T>::to_device(){
cudaMemcpy(dData, hData, mSize*sizeof(T), cudaMemcpyHostToDevice);
mData = dData;
}
// Copy the device data back into the host allocation and switch mData so
// that subsequent host-side accesses use the host copy.
template <typename T>
void Vector<T>::to_host(){
cudaMemcpy(hData, dData, mSize*sizeof(T), cudaMemcpyDeviceToHost);
mData = hData;
}
// Test kernel: mutates three elements of the vector through its
// device-callable interface (set() and operator[]).
__global__ void alpha(Vector<int> *d_num)
{
    Vector<int>& vec = *d_num;
    vec.set(0, 100);
    vec.set(2, 11);
    vec[1] = 50;
}
int main()
{
    Vector<int> num(10);
    for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9
    std::cout << "Size of vector: " << num.GetSize() << "\n";
    std::cout << num << "\n"; // print vector
    // Device-side copy of the Vector object itself; num's mData must point
    // at device memory (to_device) before the object shell is copied over,
    // so the kernel's dereferences land in device memory.
    Vector<int> *d_num;
    cudaMalloc(&d_num, sizeof(Vector<int>));
    num.to_device();
    cudaMemcpy(d_num, &(num), sizeof(Vector<int>), cudaMemcpyHostToDevice);
    // launch the kernel
    alpha<<<1,1>>>(d_num);
    // copy the modified array back to the host copy of num
    num.to_host();
    std::cout << num << "\n";
    // free GPU memory allocation and exit
    // Fix: the Vector<int> object shell allocated above was never freed.
    cudaFree(d_num);
    return 0;
}
$ nvcc -arch=sm_61 -o t101 t101.cu
$ cuda-memcheck ./t101
========= CUDA-MEMCHECK
Size of vector: 10
0 1 2 3 4 5 6 7 8 9
100 50 11 3 4 5 6 7 8 9
========= ERROR SUMMARY: 0 errors
$
注意:
根据我的测试,您发布的代码存在各种编译错误,因此我必须对您的Vector
类进行其他更改才能将其编译。
按值把对象传递给内核会调用复制构造函数、随后还会调用析构函数，这会让事情更加困难，因此我选择通过指针传递对象（这也是你原来的做法），以避免这种情况。
您的内核调用正在启动100个线程。由于他们都做了完全相同的事情,没有任何阅读活动,这没有什么特别的错误,但我已经把它改成了一个单一的线程。它仍然表现出相同的能力。
答案 1 :(得分:1)
不仅cudaMemcpyDeviceToHost
部分您遇到了麻烦。
Vector<int> num(10);
Vector<int>* d_num;
cudaMalloc(&d_num, num.GetSize()*sizeof(int));
这将在cuda全局内存上分配40个字节(假设sizeof(int)
为4),由d_num
类型Vector<int>*
指向。我不认为你期望Vector<int>
对象本身是40个字节。
让我们尝试另一种方式。
cudaMalloc(&d_num, sizeof(Vector<int>));
cudaMalloc(&d_num->mData, num.GetSize()*sizeof(int)); // assume mData is a public attribute
不幸的是,第二行将发出segmentation fault
,因为您正在从主机代码(d_num->mData
)访问设备内存。
因此，这个Vector类的实现存在不少问题。如果您打算使用固定大小的数组，直接把d_num声明为指针即可。
int* d_num;
cudaMalloc(&d_num, num.GetSize()*sizeof(int));
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);
// .. some kernel operations
cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
答案 2 :(得分:0)
Thrust是为CUDA编写的库，它提供了向量容器。http://docs.nvidia.com/cuda/thrust/ 也许它已经具备你需要的所有功能——没有必要的话，为什么要重新发明轮子呢。