Cuda-每个向量元素中有多个和

时间:2019-08-25 09:08:46

标签: vector cuda sum reduction

系数为a和b的两个Chebyshev多项式系列的乘积可以用公式表示

$$c_k = \frac{1}{2}\left(\sum_{j=0}^{k} a_j\, b_{k-j} \;+\; \sum_{j=1}^{n-k-1}\left(a_j\, b_{j+k} + a_{j+k}\, b_j\right)\right)$$

（原文此处为图片占位符 "enter image description here"；公式已根据下文代码重建。）

问题是要尽可能并行化。

我已经成功地使用cuda通过简单地对每个向量元素应用一个线程来并行化上述公式。因此,一个线程执行求和/乘法。

/**
 * RecyclerView adapter that renders a list of YouTube videos, one embedded
 * iframe player (hosted in a WebView) per row.
 */
class YoutubeAdapter(internal var context: Context, internal var youtubevideosList: List<YoutubeModel>, recyclerView: RecyclerView) : RecyclerView.Adapter<YoutubeAdapter.MViewHolder>() {

    internal var recyclerView: RecyclerView? = null
    internal var progressBar: ProgressBar? = null

    init {
        this.recyclerView = recyclerView
        this.progressBar = ProgressBar(context)
    }

    @RequiresApi(Build.VERSION_CODES.O)
    override fun onCreateViewHolder(parent: ViewGroup, viewType: Int): MViewHolder =
        MViewHolder(LayoutInflater.from(context).inflate(R.layout.youtube_row, parent, false))

    override fun onBindViewHolder(holder: MViewHolder, position: Int) {
        // Build an embeddable iframe for this row's video and load it into the WebView.
        val videoId = youtubevideosList[position].getYoutubeID()
        val embedHtml = "<iframe width=\"100%\" height=\"100%\" src=\"https://www.youtube.com/embed/$videoId\" frameborder=\"0\" allowfullscreen></iframe>"
        holder.youTubePlayerView.loadData(embedHtml, "text/html", "utf-8")
    }

    override fun getItemCount(): Int = youtubevideosList.size

    @RequiresApi(Build.VERSION_CODES.O)
    inner class MViewHolder(internal var view: View) : RecyclerView.ViewHolder(view) {
        internal var youTubePlayerView: WebView = view.findViewById<View>(R.id.webVideoView) as WebView
        internal var descTextview: TextView = view.findViewById<View>(R.id.describtion_tv) as TextView
        internal var progressBar: ProgressBar? = null

        init {
            // JavaScript must be enabled for the embedded YouTube iframe to play.
            youTubePlayerView.settings.javaScriptEnabled = true
            progressBar = ProgressBar(context)
        }
    }
}

在另一个代码中,我设法使用总和减少来求和向量中的所有元素(我从nvidia演示中复制的其他人的代码)。现在的问题是,如何结合这两种方法?我想要一堆线程来计算c的每个元素中的所有和/乘。有小费吗?还是我可以学习的类似问题?

矩阵中的行缩减可能类似于该问题。但是我有多个不同长度和乘法的和。

这是nvidia员工提供的代码(我认为)

#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <time.h>

// Computes the Chebyshev-series product coefficients:
//   c[i] = 0.5 * ( sum_{j=0}^{i} a[j]*b[i-j]
//                + sum_{j=1}^{n-i-1} (a[j]*b[j+i] + a[j+i]*b[j]) )
// One thread per output element; the caller must launch at least n threads.
__global__ void chebyprod(int n, float *a, float *b, float *c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   float sum;
   if (i < n) {
      sum = 0.f;
      // BUG FIX: the b index must be i-j. The original b[j-i] is negative
      // (out-of-bounds) for every j < i and computes the wrong convolution.
      for (int j = 0; j <= i; j++){
         sum += a[j]*b[i-j];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}

// Driver: fills two constant series, runs the chebyprod kernel once, and
// prints the elapsed time plus the first ten output coefficients.
int main(void){
  clock_t tStart = clock();
  int N = 10000;
  float *a, *b, *c, *d_a, *d_b, *d_c;
  size_t bytes = (size_t)N*sizeof(float);

  // Host buffers.
  a = (float*)malloc(bytes);
  b = (float*)malloc(bytes);
  c = (float*)malloc(bytes);
  if (a == NULL || b == NULL || c == NULL) {
    fprintf(stderr, "host allocation failed\n");
    return 1;
  }

  // Device buffers -- check every CUDA call; errors are otherwise silent.
  cudaError_t err = cudaMalloc(&d_a, bytes);
  if (err == cudaSuccess) err = cudaMalloc(&d_b, bytes);
  if (err == cudaSuccess) err = cudaMalloc(&d_c, bytes);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Constant test data, easy to sanity-check by hand.
  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);

  // Number of threads in each thread block
  int blockSize = 1024;

  // Number of thread blocks in grid (integer ceiling division, no float round-trip)
  int gridSize = (N + blockSize - 1) / blockSize;

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  err = cudaGetLastError();   // catches launch-configuration errors
  if (err != cudaSuccess)
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
  // Kernel launches are asynchronous: synchronize before stopping the clock,
  // otherwise only the launch overhead is measured.
  cudaDeviceSynchronize();
  printf("Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

  cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);

  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
}

1 个答案:

答案 0 :(得分:2)

问题中提供的代码是实现的明智第一步。线程策略是最常见/典型的策略:每个输出点(此处为N个输出点)分配一个线程。每个线程必须执行计算特定输出点所需的所有计算。改善CUDA代码性能的动机应始终解决至少2个CUDA优化优先级:

  1. 公开足够的并行性(大致:创建足够的线程)
  2. 有效利用内存(大致:为了全局内存访问,争取合并)

关于第1项,问题中提供的代码的有效性将取决于GPU。根据经验,我们希望在运行的GPU中为每个SM至少启动2048个线程(在Turing上为1024),从而有机会使GPU“饱和”。对于N = 10000,我们可以使具有5个SM的GPU饱和。对于拥有80个SM的Tesla V100,我们没有希望将具有10,000个线程的GPU饱和。

关于第2项,所提供的代码在某种程度上也不够完善;在合并时会遇到问题:在许多情况下,相邻线程不会读取内存中的相邻值。仅举一个例子,我看到的第一个全局负载为a[j]。这将为每个线程加载相同的值/位置,而不是在相邻线程中加载相邻的值。

我们能否提出一个可能在这两个方面都得到改善的替代实现?我们将考虑线程策略的以下更改:每个输出点分配一个线程块,而不是每个输出点分配一个线程。每个输出点所需的计算可以可视化为矩阵的一"行"。线程块将沿着该行"跨越",执行所需的计算,并最终进行线程块级别的缩减,以产生该行的单个结果。这将使我们能够同时解决这两个问题:同一 warp 中的相邻线程将能够从a和b中读取相邻值,并且我们还能立即将线程总数提高至多1024倍(因此,从一万个线程起步,我们最多可达约一千万个线程;一千万个线程足以饱和任何当前的CUDA GPU)。该线程策略还有另一个不错的特性:上述计算的"行"具有不同的长度。第一行和最后一行最长,约有N个计算元素,而中间的行更接近N/2个计算元素。通过选择块跨步循环(概念上类似于grid-stride loop),我们可以有效地处理变化的行长度。每个线程块仅在需要时才沿行"跨步",从而累积结果。

以下是该实现的可行示例:

$ cat t1497.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
// mt: the arithmetic type under test; switch to double to compare precision.
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
// Readable flag values for dtime_usec()'s use_sync parameter.
const bool sync = true;
const bool nosync = false;
// Microsecond timer: returns the elapsed microseconds since 'start'.
// Pass use_sync=sync to drain pending GPU work (cudaDeviceSynchronize)
// before reading the clock, so asynchronous kernel launches are fully timed.
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  unsigned long long now = (unsigned long long)tv.tv_sec * USECPSEC + tv.tv_usec;
  return now - start;
}
// Naive Chebyshev product kernel: one thread computes one output coefficient
//   c[i] = 0.5*( sum_{j=0..i} a_j*b_{i-j}
//              + sum_{j=1..n-i-1} (a_j*b_{j+i} + a_{j+i}*b_j) ).
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   const int i = blockDim.x * blockIdx.x + threadIdx.x;
   if (i >= n) return;           // guard the grid tail
   mt acc = 0.f;
   // Convolution-style part: index into b descends as j rises.
   for (int j = 0; j <= i; ++j)
      acc += a[j] * b[i - j];
   // Correlation-style part: a symmetric pair of products per term.
   for (int j = 1; j < n - i; ++j)
      acc += a[j] * b[j + i] + a[j + i] * b[j];
   c[i] = 0.5f * acc;
}
// assume one threadblock per c_k coefficient
// assume a power-of-2 threadblock size
const int tpb_p2 = 8;   // log2 of the threads-per-block count
const int nTPB = 1<<tpb_p2;   // threads per block (256)
const unsigned row_mask = ~((0xFFFFFFFFU>>tpb_p2)<<tpb_p2);   // low tpb_p2 bits set: remainder mask for the ceil-division in the kernel

// One thread block per output coefficient c[k] (launch with gridDim.x == n).
// The block strides across the "row" of products needed for c[k]; each thread
// accumulates a private partial sum, then the block reduces those partials
// (warp shuffles + shared memory by default, or a classic shared-memory sweep
// when NO_WS is defined). Assumes blockDim.x == nTPB.
__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
#ifndef NO_WS
  // One shared slot per warp; zero all 32 since a block may have < 32 warps.
  __shared__ mt sd[32];
  if (threadIdx.x < 32) sd[threadIdx.x] = 0;
  __syncthreads();
#else
  __shared__ mt sd[nTPB];
#endif
  int k = blockIdx.x;
  mt sum = 0.0f;
  // Row k requires max(k, n-k)+1 terms; strides = ceil(row_width / nTPB).
  int row_width = (((k)>(n-k))?(k):(n-k))+1;
  int strides = (row_width>>tpb_p2)+ ((row_width&row_mask)?1:0);
  int j = threadIdx.x;
  mt tmp_a;
  for (int s=0; s < strides; s++){ // block-stride loop
    if (j < n) tmp_a = a[j]; // a[j] is reused by both terms below
    if (j <= k) sum += tmp_a*b[k-j]; // first sum: a_j * b_{k-j}
    if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j]; // second sum
    j += nTPB;
    }
#ifndef NO_WS
  // 1st warp-shuffle reduction: each warp reduces its 32 lane partials.
  int lane = threadIdx.x & (warpSize-1);
  int warpID = threadIdx.x >> 5; // assumes warpSize == 32
  unsigned mask = 0xFFFFFFFFU;
  for (int offset = warpSize>>1; offset > 0; offset >>= 1)
    sum += __shfl_down_sync(mask, sum, offset);
  if (lane == 0) sd[warpID] = sum;
  __syncthreads(); // put warp results in shared mem
  // hereafter, just warp 0
  if (warpID == 0){
  // reload this lane's per-warp partial from shared mem (zero if that warp did not exist)
    sum = sd[lane];
  // final warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(mask, sum, offset);
  }
#else
  sd[threadIdx.x] = sum;
  for (int s = nTPB>>1; s > 0; s>>=1){ // sweep reduction
    __syncthreads();
    if (threadIdx.x < s) sd[threadIdx.x] += sd[threadIdx.x+s];}
  if (!threadIdx.x) sum = sd[0];
#endif
  // Thread 0 now holds the block-wide total; apply the 0.5 factor and store.
  if (!threadIdx.x) c[k] = sum*0.5f;
}

// Benchmark driver: runs the naive kernel and the improved (block-per-
// coefficient) kernel on the same data, prints timings, the first ten
// coefficients of each, and the max element-wise difference.
int main(int argc, char *argv[]){
  // Optional first CLI argument overrides the problem size N.
  int N = 10000;
  if (argc>1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  // Host buffers: c = naive-kernel result, ic = improved-kernel result.
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(N*sizeof(mt));
  b  = (mt*)malloc(N*sizeof(mt));
  c  = (mt*)malloc(N*sizeof(mt));
  ic = (mt*)malloc(N*sizeof(mt));

  cudaMalloc(&d_a, N*sizeof(mt));
  cudaMalloc(&d_b, N*sizeof(mt));
  cudaMalloc(&d_c, N*sizeof(mt));

  // Constant test series; results are easy to sanity-check.
  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);
  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements (naive: one thread per output point).
  unsigned long long  dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);   // 'sync' waits for kernel completion before timing

  cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";
  // Improved kernel: one thread block (nTPB threads) per output coefficient.
  dt = dtime_usec(0);
  chebyprod_imp<<< N, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << ic[k] << " ";
  std::cout <<"]\n";
  // Element-wise comparison; small differences are expected from float rounding.
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
}
$ nvcc -arch=sm_52 -o t1497 t1497.cu
$ ./t1497
blockSize: 1024
gridSize: 10
Time taken: 0.001687s
no error
Vector c: [ 199.996 199.986 199.976 199.966 199.956 199.946 199.936 199.926 199.916 199.906 ]
Time taken: 0.000350s
no error
Vector c: [ 199.99 199.98 199.97 199.96 199.95 199.94 199.93 199.92 199.91 199.9 ]
Max error = 0.0137787
$

(更改-arch开关以匹配您的GPU)

以上示例显示,修改后的算法运行速度提高了约5倍(在Tesla V100上)。尽管两种算法的结果存在数值差异,但这是由浮点舍入引起的。为了证明算法给出了正确的结果,请将 typedef 从 float 切换为 double。您将看到数值差异基本消失(说明两种算法在逻辑上是相同的),而且改进算法的 float 版本对前10个元素给出的结果,也更接近 double 运算产生的"更准确"的结果。

如评论所述,此算法转换可能并非在每种情况下都是有益的。主要好处将来自开发具有更大线程容量(大于N线程)的GPU。相对较小的GPU(例如,对于N = 10000,可能为8个SM或更少)可能无法从中受益,并且实际上代码的运行速度可能会比原始算法慢。

尽管我提到了合并,但对于N = 10000,此处的输入数据非常小(〜80K字节),将适合大多数GPU的L2缓存。一旦数据位于L2缓存中,低效的访问模式就不再是问题。因此,在这种情况下,该算法的主要好处可能是由于第1项。如果无法利用第1项,则该算法几乎没有收益。

出于测试目的,我使用了warp-stride循环创建了另一个版本。但是,在小型GPU上,它似乎并没有明显更快,而在V100上,它实际上却要慢一些:

#include <stdio.h>
#include <iostream>
#include <cuda.h>
// mt: the arithmetic type under test; switch to double to compare precision.
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
// Readable flag values for dtime_usec()'s use_sync parameter.
const bool sync = true;
const bool nosync = false;
// Microsecond timer: returns the elapsed microseconds since 'start'.
// Pass use_sync=sync to drain pending GPU work (cudaDeviceSynchronize)
// before reading the clock, so asynchronous kernel launches are fully timed.
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  unsigned long long now = (unsigned long long)tv.tv_sec * USECPSEC + tv.tv_usec;
  return now - start;
}
// Naive Chebyshev product kernel: one thread computes one output coefficient
//   c[i] = 0.5*( sum_{j=0..i} a_j*b_{i-j}
//              + sum_{j=1..n-i-1} (a_j*b_{j+i} + a_{j+i}*b_j) ).
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   const int i = blockDim.x * blockIdx.x + threadIdx.x;
   if (i >= n) return;           // guard the grid tail
   mt acc = 0.f;
   // Convolution-style part: index into b descends as j rises.
   for (int j = 0; j <= i; ++j)
      acc += a[j] * b[i - j];
   // Correlation-style part: a symmetric pair of products per term.
   for (int j = 1; j < n - i; ++j)
      acc += a[j] * b[j + i] + a[j + i] * b[j];
   c[i] = 0.5f * acc;
}
// assume one warp per c_k coefficient
// assume a multiple-of-32 threadblock size
const int nTPB = 32*8;   // 256 threads per block -> 8 warps per block
const int warpSize_p2 = 5; // assumes warpSize == 32
const int nWarps = nTPB>>warpSize_p2;   // warps (= coefficients handled) per block
const unsigned row_mask = ~((0xFFFFFFFFU>>warpSize_p2)<<warpSize_p2);   // low 5 bits set: remainder mask for the ceil-division in the kernel
// One warp per output coefficient c[k]; nWarps coefficients per thread block.
// Each warp strides across the "row" of products for its coefficient, then
// reduces its 32 lane partials with warp shuffles; lane 0 writes the result.
__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
  int warpID = threadIdx.x >> warpSize_p2;
  int k = blockIdx.x*(nWarps)+warpID;   // coefficient index handled by this warp
  if (k < n){
    mt sum = 0.0f;
    int lane = threadIdx.x & (warpSize-1);
    // Row k requires max(k, n-k)+1 terms; strides = ceil(row_width / warpSize).
    int row_width = (((k)>(n-k))?(k):(n-k))+1;
    int strides = (row_width>>warpSize_p2)+ ((row_width&row_mask)?1:0);
    int j = lane;
    mt tmp_a;
    for (int s=0; s < strides; s++){ // warp-stride loop
      if (j < n) tmp_a = a[j]; // a[j] is reused by both terms below
      if (j <= k) sum += tmp_a*b[k-j]; // first sum: a_j * b_{k-j}
      if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j]; // second sum
      j += warpSize;
      }
  // warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(0xFFFFFFFFU, sum, offset);
    if (lane==0) c[k] = sum*0.5f;}
}

// Benchmark driver: runs the naive kernel and the warp-per-coefficient
// kernel on the same data, prints timings, the first ten coefficients of
// each, and the max element-wise difference.
int main(int argc, char *argv[]){
  // Optional first CLI argument overrides the problem size N.
  int N = 10000;
  if (argc>1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  // Host buffers: c = naive-kernel result, ic = improved-kernel result.
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(N*sizeof(mt));
  b  = (mt*)malloc(N*sizeof(mt));
  c  = (mt*)malloc(N*sizeof(mt));
  ic = (mt*)malloc(N*sizeof(mt));

  cudaMalloc(&d_a, N*sizeof(mt));
  cudaMalloc(&d_b, N*sizeof(mt));
  cudaMalloc(&d_c, N*sizeof(mt));

  // Constant test series; results are easy to sanity-check.
  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);
  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements (naive: one thread per output point).
  unsigned long long  dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);   // 'sync' waits for kernel completion before timing

  cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";
  // Improved kernel: nWarps coefficients per block, hence (N/nWarps)+1 blocks.
  dt = dtime_usec(0);
  chebyprod_imp<<< (N/nWarps)+1, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << ic[k] << " ";
  std::cout <<"]\n";
  // Element-wise comparison; small differences are expected from float rounding.
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
}