#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <time.h>

/**
 * One thread per output element. For every i in [0, n) computes
 *
 *   c[i] = 0.5 * ( sum_{j=0..i}     a[j]*b[i-j]
 *                + sum_{j=1..n-i-1} (a[j]*b[j+i] + a[j+i]*b[j]) )
 *
 * Launch layout: 1D grid of 1D blocks with at least n threads in total;
 * threads whose global index is >= n do nothing.
 *
 * @param n  number of elements in a, b and c
 * @param a  device pointer, first input (n floats)
 * @param b  device pointer, second input (n floats)
 * @param c  device pointer, output (n floats)
 */
__global__ void chebyprod(int n, float *a, float *b, float *c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   if (i < n) {
      float sum = 0.f;
      // The second index has to be i-j: j-i is negative for every j < i
      // and would read in front of the start of b.
      for (int j = 0; j<=i; j++){
         sum += a[j]*b[i-j];
      }
      // j+i stays below n because j < n-i.
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}

/**
 * Driver: fills two N-element vectors with constants, runs chebyprod on the
 * device, prints the elapsed time and the first ten results.
 *
 * Returns 0 on success and 1 if a host allocation or the kernel fails.
 */
int main(void){
  clock_t tStart = clock();
  const int N = 10000;
  const size_t bytes = N*sizeof(float);
  float *a, *b, *c, *d_a, *d_b, *d_c;
  a = (float*)malloc(bytes);
  b = (float*)malloc(bytes);
  c = (float*)malloc(bytes);
  if (!a || !b || !c) {
    fprintf(stderr, "host allocation failed\n");
    return 1;
  }

  cudaMalloc(&d_a, bytes);
  cudaMalloc(&d_b, bytes);
  cudaMalloc(&d_c, bytes);

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);

  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid (integer ceiling division)
  gridSize = (N + blockSize - 1)/blockSize;

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  // A kernel launch returns immediately; wait for completion so the printed
  // time covers the kernel's execution and not only the launch call.
  cudaError_t err = cudaGetLastError();            // launch-configuration errors
  if (err == cudaSuccess) err = cudaDeviceSynchronize(); // execution errors
  // tStart was taken at the top of main, so this figure also includes the
  // allocations and the host-to-device copies.
  printf("Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }

  cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);

  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  return 0;
}


1 个答案:

答案 0 :(得分:2)


  1. 公开足够的并行性(大致:创建足够的线程)
  2. 有效利用内存(大致:为了全局内存访问,争取合并)

关于第1项,问题中提供的代码的有效性将取决于GPU。根据经验,我们希望在运行的GPU中为每个SM至少启动2048个线程(在Turing上为1024),从而有机会使GPU“饱和”。对于N = 10000,我们可以使具有5个SM的GPU饱和。对于拥有80个SM的Tesla V100,我们没有希望将具有10,000个线程的GPU饱和。


我们能否提出一个可能在这两个方面都得到改善的替代实现?我们将考虑对线程策略做如下更改:为每个输出点分配一个线程块,而不是为每个输出点分配一个线程。每个输出点所需的计算可以看作矩阵的一“行”。线程块将沿着该行“跨步”前进,执行所需的计算,最后进行一次线程块级别的归约(reduction),得到该行的单个结果。这将使我们能够同时解决这两个问题:同一个 warp 中的相邻线程将能够从 a 和 b 中读取相邻的值,并且我们也能立即将线程总数增加至多 1024 倍(因此,我们可以从一万个线程增加到最多约一千万个线程;一千万个线程足以使当前任何 CUDA GPU 饱和)。该线程策略还有另一个优点:上述计算的各“行”长度不同。第一行和最后一行最长,大约有 N 个计算元素,而中间的行则接近 N/2 个计算元素。通过选择块跨步循环(概念上类似于 grid-stride loop),我们可以高效地处理长度不等的行。每个线程块只沿着行“跨步”到所需的长度为止,并在此过程中累积结果。


$ cat t1497.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
// Microseconds per second.
#define USECPSEC 1000000ULL
// Readable values for dtime_usec's use_sync argument.
const bool sync = true;
const bool nosync = false;
/**
 * Wall-clock timer built on gettimeofday.
 *
 * Returns the current time in microseconds minus `start`. Call with start = 0
 * to obtain a timestamp, then pass that timestamp back in to get the elapsed
 * microseconds.
 *
 * @param start     value subtracted from the current time, in microseconds
 * @param use_sync  when true (`sync`), cudaDeviceSynchronize() is called
 *                  before the clock is read
 */
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
/**
 * Reference kernel: one thread per output element. For every i in [0, n)
 *
 *   c[i] = 0.5 * ( sum_{j=0..i}     a[j]*b[i-j]
 *                + sum_{j=1..n-i-1} (a[j]*b[j+i] + a[j+i]*b[j]) )
 *
 * Launch layout: 1D grid of 1D blocks with at least n threads in total;
 * threads whose global index is >= n do nothing.
 *
 * @param n  number of elements in a, b and c
 * @param a  device pointer, first input (read-only, n elements)
 * @param b  device pointer, second input (read-only, n elements)
 * @param c  device pointer, output (n elements)
 */
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   if (i < n) {
      mt sum = 0.f;
      for (int j = 0; j<=i; j++){
         sum += a[j]*b[i-j];
      }
      // j+i stays below n because j < n-i.
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}
// assume one threadblock per c_k coefficient
// assume a power-of-2 threadblock size
// tpb_p2: log2 of the number of threads per block
const int tpb_p2 = 8;
// nTPB: threads per block, 2^tpb_p2 (256 with tpb_p2 = 8)
const int nTPB = 1<<tpb_p2;
// row_mask: only the low tpb_p2 bits are set (0xFF with tpb_p2 = 8), so a
// nonzero (x & row_mask) means x is not an exact multiple of nTPB
const unsigned row_mask = ~((0xFFFFFFFFU>>tpb_p2)<<tpb_p2);

/**
 * One thread block per output element: block k computes c[k] with the same
 * formula as chebyprod. The block's threads walk row k in steps of nTPB,
 * each accumulating a partial sum, and the partial sums are then reduced to
 * a single value that thread 0 writes out.
 *
 * Launch layout: 1D grid with one block per output element, blockDim.x equal
 * to nTPB (a power of two). Blocks with blockIdx.x >= n exit immediately.
 *
 * Shared memory: 32 elements of mt by default (one slot per warp), or nTPB
 * elements when compiled with -DNO_WS (shared-memory sweep reduction instead
 * of warp shuffles).
 *
 * @param n  number of elements in a, b and c
 * @param a  device pointer, first input (read-only, n elements)
 * @param b  device pointer, second input (read-only, n elements)
 * @param c  device pointer, output (n elements)
 */
__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
  int k = blockIdx.x;
  // Uniform for the whole block, so returning here cannot strand a barrier.
  if (k >= n) return;
#ifndef NO_WS
  __shared__ mt sd[32];
  if (threadIdx.x < 32) sd[threadIdx.x] = 0;
  // Other warps store into sd later; the zeroing by warp 0 must be finished first.
  __syncthreads();
#else
  __shared__ mt sd[nTPB];
#endif
  mt sum = 0.0f;
  // Longest span of j this row needs: j <= k for the first term, j < n-k for the second.
  int row_width = (((k)>(n-k))?(k):(n-k))+1;
  // ceil(row_width / nTPB)
  int strides = (row_width>>tpb_p2)+ ((row_width&row_mask)?1:0);
  int j = threadIdx.x;
  mt tmp_a = 0;
  for (int s=0; s < strides; s++){ // block-stride loop
    if (j < n) tmp_a = a[j];
    if (j <= k) sum += tmp_a*b[k-j];
    if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
    j += nTPB;
  }
#ifndef NO_WS
  // 1st warp-shuffle reduction
  int lane = threadIdx.x & (warpSize-1);
  int warpID = threadIdx.x >> 5; // assumes warpSize == 32
  unsigned mask = 0xFFFFFFFFU;
  for (int offset = warpSize>>1; offset > 0; offset >>= 1)
    sum += __shfl_down_sync(mask, sum, offset);
  if (lane == 0) sd[warpID] = sum;
  __syncthreads(); // put warp results in shared mem
  // hereafter, just warp 0
  if (warpID == 0){
    // reload val from shared mem if warp existed (unused slots hold 0)
    sum = sd[lane];
    // final warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(mask, sum, offset);
  }
#else
  sd[threadIdx.x] = sum;
  for (int s = nTPB>>1; s > 0; s>>=1){ // sweep reduction
    // Every step reads values written by other threads in the previous step.
    __syncthreads();
    if (threadIdx.x < s) sd[threadIdx.x] += sd[threadIdx.x+s];
  }
  if (!threadIdx.x) sum = sd[0];
#endif
  if (!threadIdx.x) c[k] = sum*0.5f;
}

/**
 * Driver: runs chebyprod (one thread per output) and chebyprod_imp (one block
 * per output) on the same inputs, times each, prints the first results of
 * each, and reports the largest absolute difference between the two outputs.
 *
 * Usage: prog [N]   -- N defaults to 10000 and must be a positive integer.
 * Returns 0 on success, 1 on a bad N or a failed host allocation.
 */
int main(int argc, char *argv[]){
  int N = 10000;
  if (argc>1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  if (N < 1) {
    // atoi returns 0 for non-numeric text; a non-positive size cannot be launched.
    std::cout << "N must be a positive integer" << std::endl;
    return 1;
  }
  const size_t bytes = N*sizeof(mt);
  // Never print more elements than exist.
  const int nShow = (N < 10) ? N : 10;
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(bytes);
  b  = (mt*)malloc(bytes);
  c  = (mt*)malloc(bytes);
  ic = (mt*)malloc(bytes);
  if (!a || !b || !c || !ic) {
    std::cout << "host allocation failed" << std::endl;
    return 1;
  }

  cudaMalloc(&d_a, bytes);
  cudaMalloc(&d_b, bytes);
  cudaMalloc(&d_c, bytes);

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);
  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid (integer ceiling division)
  gridSize = (N + blockSize - 1)/blockSize;

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  unsigned long long  dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);

  cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < nShow; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";

  // Both kernels write to d_c. Clear it so that a failed second launch cannot
  // pass the comparison below by handing back the first kernel's results.
  cudaMemset(d_c, 0, bytes);

  dt = dtime_usec(0);
  chebyprod_imp<<< N, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, bytes, cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < nShow; ++k)
    std::cout << ic[k] << " ";
  std::cout <<"]\n";
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
  return 0;
}
$ nvcc -arch=sm_52 -o t1497 t1497.cu
$ ./t1497
blockSize: 1024
gridSize: 10
Time taken: 0.001687s
no error
Vector c: [ 199.996 199.986 199.976 199.966 199.956 199.946 199.936 199.926 199.916 199.906 ]
Time taken: 0.000350s
no error
Vector c: [ 199.99 199.98 199.97 199.96 199.95 199.94 199.93 199.92 199.91 199.9 ]
Max error = 0.0137787


以上示例显示,修改后的算法运行速度提高了约5倍(在Tesla V100上)。尽管存在数值差异,但这是由浮点运算问题引起的。为了证明算法给出了正确的结果,请将 typedef 中的 float 改为 double。您将看到两者的结果之间基本上不再存在任何数值差异(这表明两种算法在逻辑上是相同的),并且对于打印出的前10个元素,float 精度下改进算法给出的结果在数值上更接近 double 运算得到的“更准确”的结果。

如评论所述,此算法转换可能并非在每种情况下都是有益的。主要好处将来自对线程容量更大(大于N个线程)的GPU的充分利用。相对较小的GPU(例如,对于N = 10000,可能为8个SM或更少)可能无法从中受益,实际上代码的运行速度甚至可能比原始算法更慢。

尽管我提到了合并,但对于N = 10000,此处的输入数据非常小(〜80K字节),将适合大多数GPU的L2缓存。一旦数据位于L2缓存中,低效的访问模式就不再是问题。因此,在这种情况下,该算法的主要好处可能是由于第1项。如果无法利用第1项,则该算法几乎没有收益。


#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
// Microseconds per second.
#define USECPSEC 1000000ULL
// Readable values for dtime_usec's use_sync argument.
const bool sync = true;
const bool nosync = false;
/**
 * Wall-clock timer built on gettimeofday.
 *
 * Returns the current time in microseconds minus `start`. Call with start = 0
 * to obtain a timestamp, then pass that timestamp back in to get the elapsed
 * microseconds.
 *
 * @param start     value subtracted from the current time, in microseconds
 * @param use_sync  when true (`sync`), cudaDeviceSynchronize() is called
 *                  before the clock is read
 */
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
/**
 * Reference kernel: one thread per output element. For every i in [0, n)
 *
 *   c[i] = 0.5 * ( sum_{j=0..i}     a[j]*b[i-j]
 *                + sum_{j=1..n-i-1} (a[j]*b[j+i] + a[j+i]*b[j]) )
 *
 * Launch layout: 1D grid of 1D blocks with at least n threads in total;
 * threads whose global index is >= n do nothing.
 *
 * @param n  number of elements in a, b and c
 * @param a  device pointer, first input (read-only, n elements)
 * @param b  device pointer, second input (read-only, n elements)
 * @param c  device pointer, output (n elements)
 */
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   if (i < n) {
      mt sum = 0.f;
      for (int j = 0; j<=i; j++){
         sum += a[j]*b[i-j];
      }
      // j+i stays below n because j < n-i.
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}
// assume one warp per c_k coefficient
// assume a multiple-of-32 threadblock size
// nTPB: threads per block (256)
const int nTPB = 32*8;
// warpSize_p2: log2 of the warp size
const int warpSize_p2 = 5; // assumes warpSize == 32
// nWarps: warps per block, and so output coefficients handled per block (8)
const int nWarps = nTPB>>warpSize_p2;
// row_mask: only the low warpSize_p2 bits are set (0x1F), so a nonzero
// (x & row_mask) means x is not an exact multiple of 32
const unsigned row_mask = ~((0xFFFFFFFFU>>warpSize_p2)<<warpSize_p2);
/**
 * One warp per output element: warp w of block blk computes c[k] for
 * k = blk*nWarps + w, using the same formula as chebyprod. The 32 lanes walk
 * row k in steps of warpSize, each accumulating a partial sum; the partial
 * sums are combined with warp shuffles and lane 0 writes the result.
 *
 * Launch layout: 1D grid, blockDim.x equal to nTPB (k is derived from the
 * compile-time nWarps), with at least ceil(n / nWarps) blocks. Warps whose k
 * is >= n do nothing. No shared memory is used.
 *
 * @param n  number of elements in a, b and c
 * @param a  device pointer, first input (read-only, n elements)
 * @param b  device pointer, second input (read-only, n elements)
 * @param c  device pointer, output (n elements)
 */
__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
  int warpID = threadIdx.x >> warpSize_p2;
  int k = blockIdx.x*(nWarps)+warpID;
  // k depends only on the block and the warp index, so all 32 lanes of a warp
  // take this branch together and the full-mask shuffles below are valid.
  if (k < n){
    mt sum = 0.0f;
    int lane = threadIdx.x & (warpSize-1);
    // Longest span of j this row needs: j <= k for the first term, j < n-k for the second.
    int row_width = (((k)>(n-k))?(k):(n-k))+1;
    // ceil(row_width / 32)
    int strides = (row_width>>warpSize_p2)+ ((row_width&row_mask)?1:0);
    int j = lane;
    mt tmp_a = 0;
    for (int s=0; s < strides; s++){ // warp-stride loop
      if (j < n) tmp_a = a[j];
      if (j <= k) sum += tmp_a*b[k-j];
      if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
      j += warpSize;
    }
    // warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(0xFFFFFFFFU, sum, offset);
    if (lane==0) c[k] = sum*0.5f;
  }
}

/**
 * Driver: runs chebyprod (one thread per output) and chebyprod_imp (one warp
 * per output) on the same inputs, times each, prints the first results of
 * each, and reports the largest absolute difference between the two outputs.
 *
 * Usage: prog [N]   -- N defaults to 10000 and must be a positive integer.
 * Returns 0 on success, 1 on a bad N or a failed host allocation.
 */
int main(int argc, char *argv[]){
  int N = 10000;
  if (argc>1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  if (N < 1) {
    // atoi returns 0 for non-numeric text; a non-positive size cannot be launched.
    std::cout << "N must be a positive integer" << std::endl;
    return 1;
  }
  const size_t bytes = N*sizeof(mt);
  // Never print more elements than exist.
  const int nShow = (N < 10) ? N : 10;
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(bytes);
  b  = (mt*)malloc(bytes);
  c  = (mt*)malloc(bytes);
  ic = (mt*)malloc(bytes);
  if (!a || !b || !c || !ic) {
    std::cout << "host allocation failed" << std::endl;
    return 1;
  }

  cudaMalloc(&d_a, bytes);
  cudaMalloc(&d_b, bytes);
  cudaMalloc(&d_c, bytes);

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);
  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid (integer ceiling division)
  gridSize = (N + blockSize - 1)/blockSize;

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  unsigned long long  dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);

  cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < nShow; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";

  // Both kernels write to d_c. Clear it so that a failed second launch cannot
  // pass the comparison below by handing back the first kernel's results.
  cudaMemset(d_c, 0, bytes);

  dt = dtime_usec(0);
  // One warp per output element: ceil(N / nWarps) blocks of nTPB threads.
  chebyprod_imp<<< (N + nWarps - 1)/nWarps, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, bytes, cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < nShow; ++k)
    std::cout << ic[k] << " ";
  std::cout <<"]\n";
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
  return 0;
}