Question

我正在解决在CUDA中对矩阵行进行求和的问题。我给出了以下例子。

假设有以下20 * 4数组：

将二维数组展平为一维数组（按行主序或列主序）后，我需要为每一行分配一个线程，并由该线程计算这一行的成本。

例如
- 线程1应计算 1 2 3 4 的成本
- 线程2应计算 4 1 2 3 的成本

在CUDA中应该如何实现？

谢谢大家的回复

Answer 1

#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256

// Row-sum kernel for a row-major matrix: each thread adds up one row.
//   costdata : matrix elements; element (r, c) is read from costdata[r*cols + c]
//   rows     : number of rows (threads with an index >= rows do nothing)
//   cols     : number of elements summed per row
//   results  : results[r] receives the sum of row r
// The row index is derived from the x dimension of the launch only.
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
  int row = threadIdx.x + blockDim.x*blockIdx.x;
  // Threads past the last row have nothing to do.
  if (row >= rows) return;

  int rowOffset = row*cols;   // index of the first element of this row
  int total = 0;
  int col = 0;
  while (col < cols){
    total += costdata[rowOffset + col];
    ++col;
  }
  results[row] = total;
  }

// Terminates the program with a diagnostic if a CUDA runtime call failed.
//   status : value returned by the CUDA runtime call
//   what   : short description of the call, used in the message
static void checkCudaCall(cudaError_t status, const char *what){
  if (status != cudaSuccess){
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
    }
  }

// Builds an MROWS x NCOLS row-major matrix of values in 0..3, sums every row
// on the device with mykernel, and prints each device sum next to a
// host-computed reference. Returns 0 on success, 1 if host allocation fails;
// any CUDA failure aborts through checkCudaCall.
int main(){
  //define and initialize host and device storage for cost and results
  int *d_costdata = NULL, *h_costdata = NULL, *d_results = NULL, *h_results = NULL;
  h_results = (int *)malloc(MROWS*sizeof(int));
  h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
  if (h_results == NULL || h_costdata == NULL){
    fprintf(stderr, "host allocation failed\n");
    free(h_results);
    free(h_costdata);
    return 1;
    }
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = rand()%4;
  checkCudaCall(cudaMalloc((void **)&d_results, MROWS*sizeof(int)), "cudaMalloc results");
  checkCudaCall(cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int)), "cudaMalloc costdata");
  //copy cost data from host to device
  checkCudaCall(cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy H2D");
  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
  // A launch does not return a status: query it explicitly, then synchronize
  // so that errors raised while the kernel runs are reported here as well.
  checkCudaCall(cudaGetLastError(), "kernel launch");
  checkCudaCall(cudaDeviceSynchronize(), "kernel execution");
  // copy results back from device to host
  checkCudaCall(cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy D2H");
  for (int i=0; i<MROWS; i++){
    int loc_cost = 0;
    for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
    printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
    }
  // release device and host storage
  checkCudaCall(cudaFree(d_costdata), "cudaFree costdata");
  checkCudaCall(cudaFree(d_results), "cudaFree results");
  free(h_costdata);
  free(h_results);
  return 0;
  }

这里假设每行的“成本”只是该行所有元素的总和。如果您的“成本”函数不同，可以相应地修改内核中for循环里的操作。这里还假定数据按C风格的行主序存储（1 2 3 4 4 1 2 3 3 4 1 2等）。

如果您改为使用列主序存储（1 4 3等），则可以略微提高性能，因为数据读取可以完全合并（coalesced）。这时内核中的循环代码如下所示：

// Column-major variant of the summation loop: element (tidx, col) is read
// from costdata[col*rows + tidx], so for a given col consecutive tidx values
// read consecutive addresses.
for (int col = 0; col < cols; ++col)
  mycost += costdata[tidx + (col*rows)];

您还应该对所有CUDA API调用和内核启动进行正确的CUDA错误检查（proper CUDA error checking）。

编辑：正如下面的评论中所讨论的，对于行主序存储的情况，在某些情况下，选择一次加载16字节的数据而不是单个基本类型的元素，可能会提高内存访问效率。以下是一个修改版本，它针对任意维度和（或多或少）任意基本类型实现了这个想法：

#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>

#define MROWS 1742
#define NCOLS 801
#define nTPB 256

typedef double mytype;

// Reports the size in bytes of `mytype` for the element types this program
// recognises: 4 for float / int / unsigned int, 8 for double, 1 for
// char / unsigned char. Any other type yields 0.
__host__ int sizetype(){
  const std::type_info &selected = typeid(mytype);

  if (selected == typeid(double))
      return 8;
  if (selected == typeid(float) || selected == typeid(int) || selected == typeid(unsigned int))
      return 4;
  if (selected == typeid(char) || selected == typeid(unsigned char))
      return 1;
  return 0;
  }


// Row-sum kernel for a pitched, row-major matrix: each thread sums one row,
// reading 16 bytes at a time where possible.
//   costdata : first byte of the matrix; row r starts r*pitch bytes after it
//   rows     : number of rows (threads with an index >= rows do nothing)
//   cols     : number of elements of type T in each row
//   results  : results[r] receives the sum of row r (T(0) when cols <= 0)
//   size     : size of one element in bytes; expected to equal sizeof(T) and
//              to be a factor of 16
//   pitch    : distance in bytes between the starts of consecutive rows
// The row index is derived from the x dimension of the launch only.
template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx >= rows) return;

  // Elements covered by one 16-byte load. A non-positive or oversized `size`
  // yields 0, which disables the wide path instead of dividing by zero.
  const int chunk = ((size > 0) && (size <= 16)) ? 16/size : 0;
  const T *myrowptr = (const T *)(((const unsigned char *)costdata) + tidx*pitch);
  // An int4 load needs a 16-byte aligned address; rows that are not aligned
  // (e.g. a pitch that is not a multiple of 16) use the scalar loop only.
  const bool aligned = (((size_t)myrowptr) % 16) == 0;

  T mycost = (T)0;
  int count = 0;
  if ((chunk > 0) && aligned){
    while ((cols-count) >= chunk){
      // read 16 bytes
      int4 temp = *((const int4 *)(myrowptr + count));
      const T *lanes = (const T *)(&temp);
      // accumulate in element order so the result matches a scalar sum
      for (int j = 0; j < chunk; j++)
        mycost += lanes[j];
      count += chunk;
      }
    }
  // read one quantity at a time for whatever is left
  for (; count < cols; count++)
    mycost += myrowptr[count];

  // Single store after the row is complete; also reached when cols is 0.
  results[tidx] = mycost;
}

// Exits with a diagnostic when a CUDA runtime call did not succeed.
//   status : value returned by the CUDA runtime call
//   what   : short description of the call, used in the message
static void requireCudaSuccess(cudaError_t status, const char *what){
  if (status != cudaSuccess){
    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
    std::exit(EXIT_FAILURE);
    }
  }

// Builds an MROWS x NCOLS row-major matrix of `mytype` values in 0..3, copies
// it into pitched device memory, sums every row with mykernel, and compares
// each device sum with a host-computed reference.
// Returns 0 when all rows match; 1 on an unsupported `mytype`, a failed host
// allocation, or the first mismatch. CUDA failures abort the program.
int main(){
  int typesize = sizetype();
  if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
  //define and initialize host and device storage for cost and results
  mytype *d_costdata = NULL, *h_costdata = NULL, *d_results = NULL, *h_results = NULL;
  h_results = (mytype *)malloc(MROWS*sizeof(mytype));
  h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
  if ((h_results == NULL) || (h_costdata == NULL)){
    std::cerr << "host allocation failed" << std::endl;
    free(h_results);
    free(h_costdata);
    return 1;
    }
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = (mytype)(rand()%4);
  size_t pitch = 0;
  requireCudaSuccess(cudaMalloc((void **)&d_results, MROWS*sizeof(mytype)), "cudaMalloc results");
  requireCudaSuccess(cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS), "cudaMallocPitch costdata");
  //copy cost data from host to device
  requireCudaSuccess(cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype),  MROWS, cudaMemcpyHostToDevice), "cudaMemcpy2D H2D");

  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
  // A launch does not return a status: query it explicitly, then synchronize
  // so that errors raised while the kernel runs are reported here as well.
  requireCudaSuccess(cudaGetLastError(), "kernel launch");
  requireCudaSuccess(cudaDeviceSynchronize(), "kernel execution");
  // copy results back from device to host
  requireCudaSuccess(cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost), "cudaMemcpy D2H");

  int status = 0;
  for (int i=0; i<MROWS; i++){
    mytype loc_cost = (mytype)0;
    for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
    if ((i < 10) && (typesize > 1))
      std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
    if (loc_cost != h_results[i]){
      std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl;
      status = 1;
      break;   // stop at the first mismatch, but still release resources below
      }
    }
  if (status == 0)
    std::cout << "Results are correct!" << std::endl;

  // release device and host storage on every exit path
  requireCudaSuccess(cudaFree(d_costdata), "cudaFree costdata");
  requireCudaSuccess(cudaFree(d_results), "cudaFree results");
  free(h_costdata);
  free(h_results);
  return status;
  }

在CUDA中对矩阵的行（以行主序或列主序存储）求和

1 个答案: