我有一个二维问题,并通过实现一个内核来求解。由于问题是二维的,如果能在内核中以 d_A[i][j] 的形式访问列主序矩阵
而不是写成 d_A[i + m*j]
,内核的可读性会高得多。但如果我直接写 d_A[i][j]
,对于用 cudaMalloc
分配的内存,这种访问方式是行不通的。我应该使用什么函数?最好能给出一个例子。在 C++ 中,这可以通过分配二维内存来实现: double** A = new double[10][10];
这与 cudaMallocPitch
有关系吗?还是说带 pitch(行间距)的版本只是用于最大化二维对齐和合并(coalesced)内存访问?
答案 0 :(得分:3)
您可以先定义一个支持步长(stride,代码中写作 strip)的向量类,然后让二维矩阵的 operator[] 返回一个步长设置正确的向量;第二个 [] 实际上是在这个向量上调用的。下面是一个例子:
// Marks a function as callable from both host code and device code.
#define _devhost_ __host__ __device__
// Signed integer type used in this sample for lengths, indices and byte offsets.
using SizeT = long;
// Returns `ptr` moved forward by `offset` BYTES (not elements), keeping the
// pointee type. The pointer is viewed as a byte pointer for the arithmetic
// and then cast back to `const T*`.
template<typename T>
_devhost_ const T* pointer_offset(const T* ptr, SizeT offset) {
  const uint8_t* const raw_bytes = reinterpret_cast<const uint8_t*>(ptr);
  return reinterpret_cast<const T*>(raw_bytes + offset);
}
// Template tag for const-ness. The `Const` specializations visible in this
// sample expose read-only accessors only.
enum ConstEnum {
  NonConst = 0,
  Const = 1
};
// Template tag for ownership. Only `NonOwner` is used by the code shown
// here; presumably `Owner` marks a container that manages its own storage
// (TODO confirm against the Owner specializations, which are not shown).
enum OwnerEnum {
  NonOwner = 0,
  Owner = 1
};
// Template tag telling whether a Vector carries an explicit "strip", i.e.
// the distance between consecutive elements. The strip is measured in
// bytes, not in elements.
enum StripEnum {
  NonStrip = 0,
  Strip = 1
};
// Primary template of Vector. It is only declared, never defined: the
// implementations are the partial specializations selected by the tag
// arguments. Every tag defaults to its "Non*" value.
template<class ValueType, class Alloc,
         ConstEnum IsConst = NonConst,
         OwnerEnum IsOwner = NonOwner,
         StripEnum HasStrip = NonStrip>
class Vector;
// Primary template of DenseMatrix. It is only declared, never defined: the
// implementations are the partial specializations selected by the tag
// arguments. Both tags default to their "Non*" value.
template<class ValueType, class Alloc,
         ConstEnum IsConst = NonConst,
         OwnerEnum IsOwner = NonOwner>
class DenseMatrix;
// Read-only view over `len_` elements stored contiguously from `ptr_`.
// This is the (Const, NonOwner, NonStrip) specialization: it has no
// destructor and never frees the pointer it is given.
template<typename ValueType, typename Alloc>
class Vector<ValueType, Alloc, Const> {
 public:
  // Empty view: null pointer, zero length.
  _devhost_ Vector() : ptr_(0), len_(0) {}

  // Wraps `len` elements starting at `ptr`. Const-ness is cast away for
  // storage only; every accessor below hands the data back as const.
  _devhost_ Vector(const ValueType* ptr, SizeT len)
      : ptr_(const_cast<ValueType*>(ptr)), len_(len) {}

  // Element `i` of the view. No bounds checking is performed.
  _devhost_ const ValueType& operator[] (SizeT i) const {
    return *(ptr_ + i);
  }

  // Number of elements in the view.
  _devhost_ SizeT size() const { return len_; }

  // Pointer to the first element.
  _devhost_ const ValueType* data() const { return ptr_; }

 protected:
  ValueType* ptr_;  // first element (stored non-const, exposed as const)
  SizeT len_;       // element count
};
// Read-only, non-owning strided view: element `i` lives `i * strip_` BYTES
// after `ptr_`. It extends the contiguous const Vector with the byte stride.
template<typename ValueType, typename Alloc>
class Vector<ValueType, Alloc, Const, NonOwner, Strip>
    : public Vector<ValueType, Alloc, Const> {
 protected:
  typedef Vector<ValueType, Alloc, Const> Base;
  // Members of a base class that depends on template parameters are not
  // found by unqualified name lookup, so they are brought into scope here.
  using Base::ptr_;
  using Base::len_;
  SizeT strip_;  // distance between consecutive elements, in bytes

 public:
  // Empty view; the strip defaults to one element (contiguous layout).
  _devhost_ Vector() : Base(), strip_(sizeof(ValueType)) {}

  // Wraps `len` elements starting at `ptr`, spaced `strip` bytes apart.
  // When `strip` is omitted the elements are taken to be contiguous.
  _devhost_ Vector(const ValueType* ptr, SizeT len,
                   SizeT strip = sizeof(ValueType))
      : Base(ptr, len), strip_(strip) {}

  // Element `i`, located by a byte offset from the first element.
  // No bounds checking is performed.
  _devhost_ const ValueType& operator[] (SizeT i) const {
    const SizeT byte_offset = i * strip_;
    return *pointer_offset(ptr_, byte_offset);
  }

  // NOTE: the inherited size() and data() remain callable, but with a
  // non-trivial strip they do not describe a contiguous range.
};
// Read-only, non-owning view of a dense matrix stored in column-major
// order: element (irow, icol) lives at vals_[irow + icol * nrows_].
// Columns are assumed to be packed back to back (the column stride equals
// nrows_ elements); no separate pitch / leading dimension is stored.
template<typename ValueType, typename Alloc>
class DenseMatrix<ValueType, Alloc, Const> {
 protected:
  ValueType* vals_;       // first element (stored non-const, exposed as const)
  SizeT nrows_, ncols_;   // matrix dimensions

 public:
  // Empty matrix: null data pointer, zero rows and columns.
  _devhost_ DenseMatrix() : vals_(0), nrows_(0), ncols_(0) {}

  // Wraps an existing column-major buffer of n_rows * n_cols elements.
  // The pointer is taken from the `vals` argument; the previous code
  // assigned the member to itself and left it indeterminate.
  _devhost_ DenseMatrix(const ValueType* vals, SizeT n_rows, SizeT n_cols)
      : vals_(const_cast<ValueType*>(vals)),
        nrows_(n_rows), ncols_(n_cols) {}

  // Number of rows.
  _devhost_ SizeT num_rows() const { return nrows_; }
  // Number of columns.
  _devhost_ SizeT num_cols() const { return ncols_; }
  // Total number of elements (rows * columns).
  _devhost_ SizeT numel() const { return nrows_ * ncols_; }
  // Pointer to element (0, 0).
  _devhost_ const ValueType* data() const { return vals_; }

  // Element (irow, icol) via the column-major linear index.
  // No bounds checking is performed.
  _devhost_ const ValueType& at(SizeT irow, SizeT icol) const {
    return vals_[irow + icol * nrows_];
  }

  // Row accessor that enables the m[irow][icol] syntax. It returns a
  // strided view that starts at element (irow, 0), has ncols_ elements and
  // steps one full column (nrows_ * sizeof(ValueType) bytes) per element,
  // so m[irow][icol] addresses the same element as at(irow, icol).
  typedef Vector<ValueType, Alloc, Const, NonOwner, Strip> ConstIndexer;
  _devhost_ ConstIndexer operator[] (SizeT irow) const {
    return ConstIndexer(vals_ + irow, ncols_, nrows_ * sizeof(ValueType));
  }

  // View of `n_cols` consecutive columns starting at column `icol`.
  // The sub-matrix shares storage with this matrix.
  _devhost_ DenseMatrix<ValueType, Alloc, Const> get_cols(SizeT icol,
                                                          SizeT n_cols) const {
    return DenseMatrix<ValueType, Alloc, Const>(vals_ + icol * nrows_,
                                                nrows_, n_cols);
  }

  // Contiguous view of column `icol` (nrows_ elements).
  _devhost_ Vector<ValueType, Alloc, Const> get_col(SizeT icol) const {
    return Vector<ValueType, Alloc, Const>(vals_ + icol * nrows_, nrows_);
  }
};
答案 1 :(得分:1)
如果你关心的只是可读性,我会直接使用一个 #define 宏:
// Column-major accessor for d_A: A(i,j) expands to d_A[(i) + m*(j)].
// `d_A` and `m` must be in scope where the macro is used; m is presumably
// the number of rows (leading dimension) — confirm at the call site.
// The arguments are parenthesized so that expressions such as A(i, j+1)
// index the intended element.
#define A(i,j) d_A[(i) + m*(j)]