Question

我有一个二维问题，需要通过编写内核来求解。由于问题是二维的，如果能在内核中用 d_A[i][j]（列主序矩阵）而不是 d_A[i + m*j] 来访问元素，可读性会高得多。但如果只用 cudaMalloc 分配内存，d_A[i][j] 这种访问方式是无效的。我应该使用哪个函数？如果能给出一个例子就最好了。在 C++ 中，这可以通过分配二维内存来实现，例如：double** A = new double[10][10];

这与 cudaMallocPitch 有关系吗？还是说带 pitch（行间距）的版本只是用来优化二维数据的对齐和合并内存访问？

Answer 1

您可以先定义一个支持步长（代码中称为 strip，以字节为单位）的向量类，然后让二维矩阵的 operator[] 返回一个步长设置正确的向量；这样第二个 [] 实际上是在该向量上调用的。下面是一个例子：

// Shorthand for the CUDA qualifiers `__device__ __host__`, used below on
// every member function so it can be compiled for both host and device.
#define _devhost_ __device__ __host__
// Signed integer type used throughout this file for lengths, element
// indices and byte offsets.
typedef long SizeT;

// Returns `ptr` moved forward by `offset` BYTES (not elements).
// The pointer is viewed as a byte pointer for the arithmetic and then
// converted back to `const T*`.
template<typename T>
_devhost_ const T* pointer_offset(const T* ptr, SizeT offset) {
  const uint8_t* raw_bytes = reinterpret_cast<const uint8_t*>(ptr);
  const uint8_t* shifted = raw_bytes + offset;
  return reinterpret_cast<const T*>(shifted);
}

// Template tag selecting between the mutable (NonConst) and read-only
// (Const) flavour of Vector / DenseMatrix.
enum ConstEnum {
  NonConst = 0,
  Const = 1
};

// Template tag for the ownership flavour of Vector / DenseMatrix.
// NOTE(review): presumably Owner means the object manages its own storage;
// only NonOwner specializations are visible here -- confirm.
enum OwnerEnum {
  NonOwner = 0,
  Owner = 1
};

// Template tag selecting whether a Vector carries a strip (the distance
// between consecutive elements). The strip is measured in bytes.
enum StripEnum {
  NonStrip = 0,
  Strip = 1
};

// Primary Vector template, declared but not defined here.
// Defaults: non-const, non-owning, no strip.
template<typename ValueType, typename Alloc,
         ConstEnum IsConst = NonConst,
         OwnerEnum IsOwner = NonOwner,
         StripEnum HasStrip = NonStrip>
class Vector;

// Primary DenseMatrix template, declared but not defined here.
// Defaults: non-const, non-owning.
template<typename ValueType, typename Alloc,
         ConstEnum IsConst = NonConst,
         OwnerEnum IsOwner = NonOwner>
class DenseMatrix;

// Read-only, non-owning view over `len_` consecutive elements starting at
// `ptr_`. The class has no destructor, so it never releases the memory.
template<typename ValueType, typename Alloc>
class Vector<ValueType, Alloc, Const> {
protected:
  ValueType* ptr_;  // first element; stored non-const, exposed only as const
  SizeT len_;       // number of elements

public:
  // Empty view: null pointer, zero length.
  _devhost_ Vector() : ptr_(0), len_(0) {}

  // Wraps `len` elements starting at `ptr`.
  _devhost_ Vector(const ValueType* ptr, SizeT len)
      : ptr_(const_cast<ValueType*>(ptr)), len_(len) {}

  // Read-only access to element `i`; no bounds checking is performed.
  _devhost_ const ValueType& operator[] (SizeT i) const {
    return *(ptr_ + i);
  }

  // Number of elements in the view.
  _devhost_ SizeT size() const { return len_; }

  // Pointer to the first element.
  _devhost_ const ValueType* data() const { return ptr_; }
};

// Read-only, non-owning view whose consecutive elements lie `strip_` BYTES
// apart in memory. With the default strip of sizeof(ValueType) it indexes
// the same addresses as the contiguous base view.
template<typename ValueType, typename Alloc>
class Vector<ValueType, Alloc, Const, NonOwner, Strip>:
  public Vector<ValueType, Alloc, Const> {

protected:
  typedef Vector<ValueType, Alloc, Const> Base;

  SizeT strip_;  // distance between consecutive elements, in bytes

  // Unqualified (non-dependent) name lookup does not search a base class
  // that depends on template arguments, so the inherited members must be
  // brought into scope explicitly.
  using Base::ptr_;
  using Base::len_;

public:
  // Empty view with the contiguous strip.
  _devhost_ Vector() : Base(), strip_(sizeof(ValueType)) {}

  // Wraps `len` elements starting at `ptr`, spaced `strip` bytes apart.
  _devhost_ Vector(const ValueType* ptr, SizeT len,
      SizeT strip = sizeof(ValueType)) : Base(ptr, len), strip_(strip) {}

  // Read-only access to element `i`, located `i * strip_` bytes past
  // `ptr_`; no bounds checking is performed.
  _devhost_ const ValueType& operator[] (SizeT i) const {
    const SizeT byte_offset = i * strip_;
    return *pointer_offset(ptr_, byte_offset);
  }

  // NOTE: the inherited size() and data() remain callable, but they do not
  // take the strip into account, so interpret their results with care.
};

// Read-only, non-owning view of a dense matrix stored in column-major
// order: element (irow, icol) lives at vals_[irow + icol * nrows_].
// The class has no destructor, so it never releases the memory.
template<typename ValueType, typename Alloc>
class DenseMatrix<ValueType, Alloc, Const> {
protected:
  ValueType* vals_;      // first element; stored non-const, exposed as const
  SizeT nrows_, ncols_;  // matrix dimensions

public:
  // Empty matrix: null data pointer, 0 x 0.
  _devhost_ DenseMatrix() : vals_(0), nrows_(0), ncols_(0) {}

  // Wraps an existing column-major buffer `vals` of n_rows x n_cols
  // elements.
  // Fix: the data pointer is taken from the `vals` argument. The previous
  // code assigned the (uninitialized) member `vals_` to itself, leaving the
  // matrix with a garbage pointer.
  _devhost_ DenseMatrix(const ValueType* vals, SizeT n_rows, SizeT n_cols)
      : vals_(const_cast<ValueType*>(vals)),
        nrows_(n_rows), ncols_(n_cols) {}

  _devhost_ SizeT num_rows() const {return nrows_;}
  _devhost_ SizeT num_cols() const {return ncols_;}
  // Total number of elements (rows * columns).
  _devhost_ SizeT numel() const {return nrows_ * ncols_;}

  // Pointer to the first element of the underlying buffer.
  _devhost_ const ValueType* data() const {return vals_;}

  // Read-only access to element (irow, icol); no bounds checking.
  _devhost_ const ValueType& at(SizeT irow, SizeT icol) const {
    return vals_[irow + icol * nrows_];
  }

  // Row view returned by operator[]; indexing it with a column number
  // yields the matrix element, which enables the m[irow][icol] syntax.
  typedef Vector<ValueType, Alloc, Const, NonOwner, Strip> ConstIndexer;

  // Returns a view of row `irow`. In column-major storage the elements of
  // one row are nrows_ elements apart, hence the strip of
  // nrows_ * sizeof(ValueType) bytes.
  _devhost_ ConstIndexer operator[] (SizeT irow) const {
    return ConstIndexer(vals_ + irow, ncols_, nrows_ * sizeof(ValueType));
  }

  // Returns a view of `n_cols` consecutive columns starting at column
  // `icol`; the result shares this matrix's storage.
  _devhost_ DenseMatrix<ValueType, Alloc, Const> get_cols(SizeT icol,
      SizeT n_cols) const {
    return DenseMatrix<ValueType, Alloc, Const>(vals_ + icol * nrows_,
        nrows_, n_cols);
  }

  // Returns a contiguous view of column `icol` (nrows_ elements).
  _devhost_ Vector<ValueType, Alloc, Const> get_col(SizeT icol) const {
    return Vector<ValueType, Alloc, Const>(vals_ + icol * nrows_, nrows_);
  }
};

Answer 2

如果你关心的只是可读性，我会直接使用一条 #define 宏：

// Column-major 2D accessor for d_A with m rows. The arguments are
// parenthesized so that expressions such as A(i+1, j+1) expand correctly.
#define A(i,j) d_A[(i) + m*(j)]

如何在CUDA内核中以二维方式访问内存？例如 d_A[i][j]

2 个答案: