Question

我对CUDA编程比较陌生。我已经理解了编程模型，并且编写过几个基本的内核。我知道如何将内核应用于矩阵（存储为1D数组）的每个元素，但现在我想弄清楚如何对输入矩阵的每一行/每一列应用相同的操作。

假设我有一个MxN矩阵和一个长度为N的向量。我想把该向量加到矩阵的每一行上（也可以是任何其他数学运算）。这种操作的串行代码是：

// Serial reference: visits every (row, column) pair and adds the vector
// entry for that column to one matrix element.
for (int c = 0; c < columns; c++) 
{
    for (int r = 0; r < rows; r++)
    {
        // NOTE(review): the flat index uses `rows` as the row stride; for a
        // row-major matrix with `columns` entries per row the stride would be
        // `columns` — confirm the intended layout.
        M[r * rows + c] += V[c];
    }
}

现在用于执行此操作的CUDA代码应该非常简单：我应该生成与元素一样多的cuda线程并应用此内核：

// Adds one vector entry to one matrix element per thread.
//   size   - total number of matrix elements; threads whose flat index is
//            not below this value do nothing
//   matrix - flat array, updated in place
//   vector - values to add; indexed by the thread's position inside its block
__global__ void kernel(const unsigned int size, float* matrix, const float* vector)
{
    // flat index of the element owned by this thread
    const unsigned int element = blockDim.x * blockIdx.x + threadIdx.x;

    // threads past the end of the matrix have nothing to do
    if (element >= size)
        return;

    // add the vector entry selected by the in-block thread index
    matrix[element] = matrix[element] + vector[threadIdx.x];
}

它运行但结果不正确。实际上，如果我在内核完成其工作后转置矩阵，它是正确的。不幸的是，我不知道为什么它以这种方式工作。你能帮我弄清楚这个问题吗？提前谢谢。

编辑＃1

我使用以下命令启动内核：

// threads per block
int block_size = 64;
// number of blocks, rounded up so that grid_size * block_size >= M * N
int grid_size = (M * N + block_size - 1) / block_size;
// launch with M * N passed as the kernel's element count
kernel<<<grid_size, block_size>>>(M * N, matrix, vector);

编辑＃2

我通过修复@RobertCrovella建议的CPU代码来解决问题：

// corrected flat index: the row stride is `columns`
M[r * columns + c] += V[c];

索引应与外层for循环一致，即按列数（columns）计算。

Answer 1

问题中显示的内核无需修改即可用于将向量加到矩阵的每一行（假设采用C风格的行主序存储），但存在一些限制。演示见原回答中的链接。

该方法的主要限制是：可处理的最大向量长度（也就是矩阵宽度）等于每个块的最大线程数，在当前CUDA 7支持的GPU上为1024。

我们可以稍微修改向量的索引方式，并将行宽（列数）作为参数传递给内核，从而消除该限制。经过这一修改，我们应该能够处理任意大小的矩阵（和向量）。

编辑：根据讨论/评论，OP想知道如何处理行主序或列主序的底层存储。以下示例使用模板化内核来选择行主序或列主序的底层存储，并且还展示了一种可能的CUBLAS方法：使用rank-1更新函数（<T>ger）将向量加到矩阵的每一行：

$ cat t712.cu
#include <iostream>
#include <cublas_v2.h>

#define ROWS 20
#define COLS 10

#define nTPB 64

#define ROW_MAJOR 0
#define COL_MAJOR 1

// Adds vector[c] to every element in column c of a height x width matrix,
// i.e. adds the same length-`width` vector to each matrix row, in place.
//
// Template parameters:
//   select - ROW_MAJOR: element (r, c) is stored at r*width + c
//            any other value (COL_MAJOR): element (r, c) is stored at c*height + r
//   T      - element type of the matrix and the vector
// Parameters:
//   height - number of matrix rows
//   width  - number of matrix columns (also the vector length)
//   matrix - device pointer to height*width elements, updated in place
//   vector - device pointer to width elements
//
// Launch: 1D grid of 1D blocks. Any grid size is valid; each thread steps
// through the flat index space by the total number of launched threads, so a
// grid smaller than the matrix still covers every element.
template <int select, typename T>
__global__ void vec_mat_row_add(const unsigned int height, const unsigned int width, T* matrix, const T* vector)
{
    // 64-bit arithmetic: height*width and the flat thread index can exceed
    // the range of unsigned int for large matrices.
    const unsigned long long total  = (unsigned long long)height * width;
    const unsigned long long stride = (unsigned long long)gridDim.x * blockDim.x;

    for (unsigned long long idx = (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total;
         idx += stride)
    {
        // `select` is a compile-time constant, so only one branch survives
        // in each instantiation.
        if (select == ROW_MAJOR)
            matrix[idx] += vector[idx % width];   // column = idx mod row length
        else // COL_MAJOR
            matrix[idx] += vector[idx / height];  // column = idx / column length
    }
}

// Prints a ROWS x COLS host matrix, one matrix row per output line.
//   h_mat     - host buffer holding ROWS*COLS floats
//   col_major - true:  element (i, j) is read from h_mat[j*ROWS+i]
//               false: element (i, j) is read from h_mat[i*COLS+j]
static void print_matrix(const float *h_mat, bool col_major)
{
  for (int i = 0; i < ROWS; i++){
    for (int j = 0; j < COLS; j++)
      std::cout << (col_major ? h_mat[j*ROWS+i] : h_mat[i*COLS+j]) << " ";
    std::cout << std::endl;}
}

// Demonstrates adding a vector to every row of a ROWS x COLS matrix three
// ways: the templated kernel with row-major storage, the same kernel with
// column-major storage, and a cuBLAS rank-1 update (column-major).
// Returns 0 on success, 1 if a host allocation or a cuBLAS call fails.
// CUDA runtime calls are intentionally left unchecked to keep the example
// short; every resource acquired here is released before returning.
int main(){

  const unsigned int msz = ROWS*COLS*sizeof(float);   // matrix size in bytes
  const unsigned int vsz = COLS*sizeof(float);        // row-vector size in bytes
  const unsigned int osz = ROWS*sizeof(float);        // all-ones vector size in bytes
  const unsigned int nblocks = (ROWS*COLS + nTPB -1)/nTPB; // one thread per element, rounded up

  float *h_mat  = (float *)malloc(msz);
  float *h_vec  = (float *)malloc(vsz);
  float *h_ones = (float *)malloc(osz);
  if (!h_mat || !h_vec || !h_ones) {
    std::cout << "Host allocation failed" << std::endl;
    free(h_mat); free(h_vec); free(h_ones);   // free(NULL) is a no-op
    return 1;}

  // Initialised to null so the cleanup below is safe even if an allocation fails.
  float *d_mat = 0, *d_vec = 0, *d_ones = 0;
  cudaMalloc(&d_mat, msz);
  cudaMalloc(&d_vec, vsz);
  cudaMalloc(&d_ones, osz);

  for (int i=0; i<COLS; i++) h_vec[i] = i; // set vector to 0,1,2, ...
  cudaMemcpy(d_vec, h_vec, vsz, cudaMemcpyHostToDevice);

  // test row-major case
  cudaMemset(d_mat, 0, msz); // set matrix to zero
  vec_mat_row_add<ROW_MAJOR><<<nblocks, nTPB>>>(ROWS, COLS, d_mat, d_vec);
  cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
  std::cout << "Row-major result: " << std::endl;
  print_matrix(h_mat, false);

  // test column-major case
  cudaMemset(d_mat, 0, msz); // set matrix to zero
  vec_mat_row_add<COL_MAJOR><<<nblocks, nTPB>>>(ROWS, COLS, d_mat, d_vec);
  cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
  std::cout << "Column-major result: " << std::endl;
  print_matrix(h_mat, true);

  // test CUBLAS, doing matrix-vector add using <T>ger:
  // A += alpha * ones * vec^T adds vec[j] to every element of column j.
  cudaMemset(d_mat, 0, msz); // set matrix to zero
  for (int i =0; i<ROWS; i++) h_ones[i] = 1.0f;
  cudaMemcpy(d_ones, h_ones, osz, cudaMemcpyHostToDevice);

  int rc = 0;
  cublasHandle_t ch;
  cublasStatus_t stat = cublasCreate(&ch);
  const bool have_handle = (stat == CUBLAS_STATUS_SUCCESS);
  if (have_handle) {
    float alpha = 1.0f;
    stat = cublasSger(ch, ROWS, COLS, &alpha, d_ones, 1, d_vec, 1, d_mat, ROWS);}
  if (stat != CUBLAS_STATUS_SUCCESS) {
    std::cout << "CUBLAS error: " << (int)stat << std::endl;
    rc = 1;}
  else {
    cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
    std::cout << "CUBLAS Column-major result: " << std::endl;
    print_matrix(h_mat, true);}

  // Release everything on both the success and the error path.
  if (have_handle) cublasDestroy(ch);
  cudaFree(d_ones);
  cudaFree(d_vec);
  cudaFree(d_mat);
  free(h_ones);
  free(h_vec);
  free(h_mat);

  return rc;
}
$ nvcc -o t712 t712.cu -lcublas
$ ./t712
Row-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
Column-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
CUBLAS Column-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
$

为简洁起见，我没有包含完善的CUDA错误检查（proper cuda error checking），但只要CUDA代码出现问题，进行错误检查总是个好主意。作为一种快捷的替代办法，您可以用cuda-memcheck运行代码，快速检查是否存在任何CUDA错误。

请注意，我们期望3个打印输出完全相同，因为无论底层存储是行主序还是列主序，这都是显示该矩阵的正确方式。底层存储的差异已在负责显示输出的for循环中加以处理。

Answer 2

Robert Crovella已经回答了这个问题，提供了使用显式CUDA内核和cuBLAS的示例。

作为将来的参考，我还补充一个使用CUDA Thrust执行逐行或逐列操作的示例。具体来说，我关注两个问题：

将列向量加到矩阵的所有列上；
将行向量加到矩阵的所有行上。

thrust::transform的通用性使得下面的示例可以推广到加法之外的其他逐元素运算（例如乘法、除法、减法等）。

#include <cstdio>
#include <iostream>

#include <thrust/device_vector.h>
#include <thrust/equal.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/random.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/unique.h>

using namespace thrust::placeholders;

/*************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX */
/*************************************/
// Functor mapping a flat index to the quotient index / Ncols, i.e. the row
// number when the flat index addresses a row-major matrix with Ncols entries
// per row. Callable from host and device code.
// NOTE(review): thrust::unary_function is reported as deprecated in recent
// Thrust/CCCL releases — confirm against the toolkit version in use.
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {

    T Ncols; // --- Number of columns (the divisor; must be non-zero)

    __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

    // const-qualified: the call does not modify the functor, so it can be
    // invoked through const objects and const references as well.
    __host__ __device__ T operator()(T i) const { return i / Ncols; }
};

/********/
/* MAIN */
/********/
int main()
{
    // Builds a random Nrows x Ncols row-major matrix, a length-Nrows column
    // vector and a length-Ncols row vector, then prints
    //   (1) the matrix with the column vector added to every column, and
    //   (2) the matrix with the row vector added to every row.
    // Both results are produced by a single thrust::transform whose second
    // input looks the vector entry up from the flat matrix index.

    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/

    const int Nrows = 10;           // --- Number of rows
    const int Ncols =  3;           // --- Number of columns  

    // --- Random uniform integer distribution between 0 and 100
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist1(0, 100);

    // --- Random uniform integer distribution between 1 and 4
    thrust::uniform_int_distribution<int> dist2(1, 4);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_matrix(Nrows * Ncols);
    for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist1(rng);

    // --- Column vector allocation and initialization
    thrust::device_vector<float> d_column(Nrows);
    for (size_t i = 0; i < d_column.size(); i++) d_column[i] = (float)dist2(rng);

    // --- Row vector allocation and initialization
    thrust::device_vector<float> d_row(Ncols);
    for (size_t i = 0; i < d_row.size(); i++) d_row[i] = (float)dist2(rng);

    printf("\n\nOriginal matrix\n");
    for(int i = 0; i < Nrows; i++) {
        std::cout << "[ ";
        for(int j = 0; j < Ncols; j++)
            std::cout << d_matrix[i * Ncols + j] << " ";
        std::cout << "]\n";
    }

    printf("\n\nColumn vector\n");
    for(int i = 0; i < Nrows; i++) std::cout << d_column[i] << "\n";

    printf("\n\nRow vector\n");
    for(int i = 0; i < Ncols; i++) std::cout << d_row[i] << " ";

    /*******************************************************/
    /* ADDING THE SAME COLUMN VECTOR TO ALL MATRIX COLUMNS */
    /*******************************************************/

    thrust::device_vector<float> d_matrix2(d_matrix);

    // --- Flat index k belongs to row k / Ncols, so element k receives d_column[k / Ncols]
    thrust::transform(d_matrix.begin(), d_matrix.end(),
                      thrust::make_permutation_iterator(
                                d_column.begin(),
                                thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols))),
                      d_matrix2.begin(),
                      thrust::plus<float>());

    printf("\n\nColumn + Matrix -> Result matrix\n");
    for(int i = 0; i < Nrows; i++) {
        std::cout << "[ ";
        for(int j = 0; j < Ncols; j++)
            std::cout << d_matrix2[i * Ncols + j] << " ";
        std::cout << "]\n";
    }

    /*************************************************/
    /* ADDING THE SAME ROW VECTOR TO ALL MATRIX ROWS */
    /*************************************************/

    thrust::device_vector<float> d_matrix3(d_matrix);

    // --- Flat index k belongs to column k % Ncols, so element k receives d_row[k % Ncols].
    // --- The matrix is read and written in storage order; only the row-vector
    // --- lookup is permuted, which gives the same result as walking the
    // --- matrix column by column through permutation iterators.
    thrust::transform(d_matrix.begin(), d_matrix.end(),
                      thrust::make_permutation_iterator(
                                d_row.begin(),
                                thrust::make_transform_iterator(thrust::make_counting_iterator(0), _1 % Ncols)),
                      d_matrix3.begin(),
                      thrust::plus<float>());

    printf("\n\nRow + Matrix -> Result matrix\n");
    for(int i = 0; i < Nrows; i++) {
        std::cout << "[ ";
        for(int j = 0; j < Ncols; j++)
            std::cout << d_matrix3[i * Ncols + j] << " ";
        std::cout << "]\n";
    }

    return 0; 
}

使用CUDA对矩阵进行逐行/逐列操作

2 个答案: