CUDA - parallel reduction over one axis

Time: 2018-07-25 19:10:55

Tags: cuda shared-memory reduction

I'm fairly new to CUDA programming and I'm trying to write a CUDA kernel that performs a parallel reduction over only one dimension of a 3D tensor, where the tensor is fed to the kernel as a row-major flattened float array.

In other words, I'm trying to rewrite numpy.sum for the limited axis combinations axis=0, axis=1 and axis=2.
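
To make the target behavior concrete, here is a minimal host-side sketch of the axis=2 case (my own illustration, with arbitrary names), for a row-major (dim0, dim1, dim2) array:

// Host reference for out = in.sum(axis=2); 'in' is row-major with shape (dim0, dim1, dim2).
void reduce_axis2_host(const float* in, float* out, int dim0, int dim1, int dim2) {
    for (int d0 = 0; d0 < dim0; d0++)
        for (int d1 = 0; d1 < dim1; d1++) {
            float s = 0;
            for (int d2 = 0; d2 < dim2; d2++)
                s += in[(d0 * dim1 + d1) * dim2 + d2];  // flattened row-major index
            out[d0 * dim1 + d1] = s;                    // output has shape (dim0, dim1)
        }
}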

I have successfully implemented "reduce over axis 0" and "reduce over axis 1", but performance problems with "reduce over axis 2" are what led me to ask here for advice.

The kernel is launched with a 1D grid and 1D block configuration, and it maps each thread to one element of the reduced output tensor. So it should look something like this: Link to image
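
The launch therefore looks roughly like this (a sketch; BLOCK_SIZE and the device pointer names are placeholders):

// One thread per output element; for the axis=2 case the output has dim0*dim1 elements.
int out_elems = dim0 * dim1;
int grid = (out_elems + BLOCK_SIZE - 1) / BLOCK_SIZE;   // 1D grid
kernel_reduce_sum_3d_try02<<<grid, BLOCK_SIZE>>>(d_in, d_out,
                                                 dim0, dim1, dim2,
                                                 0, 0, 1);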

Here is my kernel:

__global__ void kernel_reduce_sum_3d_try02(
        float* g_idata,
        float* g_odata,
        int dim0,
        int dim1,
        int dim2,
        int overaxis0,
        int overaxis1,
        int overaxis2)
{

    if (overaxis0 == 0 && overaxis1 == 0 && overaxis2 == 1) { 
        // static shared memory
        __shared__ float smem_store[BLOCK_SIZE];

        // set thread ID
        //unsigned int tid = threadIdx.x;
        unsigned int tid =  threadIdx.x;

        // global index
        unsigned int idx = (blockIdx.x * blockDim.x + threadIdx.x);

        // unrolling
        float tmpSum0 = 0;
        unsigned int i = 0;
        unsigned int src_index ;
        unsigned int _limit = (unsigned int)(dim0 * dim1 * dim2);

        //Indices over output 
        int thrd_d0 = (idx) / (dim1*1);
        int thrd_d1 = (idx - thrd_d0*dim1);

        //Only for debugging 
        printf("tid: %03d \tidx: %03d\td0: %02d\td1: %02d\n",tid,idx,thrd_d0,thrd_d1);



        for (i = 0; i < dim2; i++) {
            src_index = thrd_d0*dim1*dim2 + thrd_d1 * dim2 + i;
            if(idx<15) printf("idx: %d : src_index: %d\n",idx,src_index);
            if(src_index < _limit)
                tmpSum0 += g_idata[src_index];
        }


        if (src_index + 0 < _limit) smem_store[tid + 0] = tmpSum0;
        __syncthreads();

        unsigned int oindx = (unsigned int)( thrd_d0*dim1 + thrd_d1 );
        if (src_index + 0 <= _limit) g_odata[oindx + 0] = smem_store[tid + 0];
    }
}

Question 1. Is there a better way to assign threads and blocks to compute the output tensor?

Question 2. How can I increase occupancy in the kernel? (it is around 50%)

Question 3. How should I use shared memory for the global memory read operations? (If "dim2" is large, each block would have to allocate a huge shared memory buffer, which hurts concurrent execution and occupancy.)

1 Answer:

Answer 0 (score: 2)

The first two optimization priorities for any CUDA programmer are:

  1. Expose enough parallelism (roughly: be able to use lots of threads)
  2. Make efficient use of the memory subsystem

For global memory, goal 2 means we should strive for coalesced access when reading from or writing to global memory. One example of coalesced access is when adjacent threads in a warp are reading adjacent locations in memory.
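
As a minimal illustration of the difference (a sketch, not taken from your code):

// Coalesced: adjacent threads in a warp read adjacent float locations.
__global__ void coalesced_read(const float* data, float* out) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = data[i];                // thread k reads element k
}

// Not coalesced: adjacent threads read locations 'stride' elements apart.
// With stride = dim2, this is essentially what your axis=2 kernel does.
__global__ void strided_read(const float* data, float* out, int stride) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = data[i * stride];       // thread k reads element k*stride
}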

Does your kernel do that? It does not. Here is a simple test case:

$ cat t1.cu
#include <stdio.h>
#define BLOCK_SIZE 256

__global__ void kernel_reduce_sum_3d_try02(
        float* g_idata,
        float* g_odata,
        int dim0,
        int dim1,
        int dim2,
        int overaxis0,
        int overaxis1,
        int overaxis2)
{

    if (overaxis0 == 0 && overaxis1 == 0 && overaxis2 == 1) {
        // static shared memory
        __shared__ float smem_store[BLOCK_SIZE];

        // set thread ID
        //unsigned int tid = threadIdx.x;
        unsigned int tid =  threadIdx.x;

        // global index
        unsigned int idx = (blockIdx.x * blockDim.x + threadIdx.x);

        // unrolling
        float tmpSum0 = 0;
        unsigned int i = 0;
        unsigned int src_index ;
        unsigned int _limit = (unsigned int)(dim0 * dim1 * dim2);

        //Indices over output
        int thrd_d0 = (idx) / (dim1*1);
        int thrd_d1 = (idx - thrd_d0*dim1);

        //Only for debugging
        //printf("tid: %03d \tidx: %03d\td0: %02d\td1: %02d\n",tid,idx,thrd_d0,thrd_d1);



        for (i = 0; i < dim2; i++) {
            src_index = thrd_d0*dim1*dim2 + thrd_d1 * dim2 + i;
            //if(idx<15) printf("idx: %d : src_index: %d\n",idx,src_index);
            if(src_index < _limit){
                tmpSum0 += g_idata[src_index];
                if ((blockIdx.x == 0) && (i == 0) && (threadIdx.x < 32)) printf("thread: %d, src_index: %u\n", threadIdx.x, src_index);
            }
        }


        if (src_index + 0 < _limit) smem_store[tid + 0] = tmpSum0;
        __syncthreads();

        unsigned int oindx = (unsigned int)( thrd_d0*dim1 + thrd_d1 );
        if (src_index + 0 <= _limit) g_odata[oindx + 0] = smem_store[tid + 0];
    }
}


int main(){



  size_t dim = 256;
  size_t sz = dim*dim*dim*sizeof(float);
  float *g_idata, *g_odata;
  cudaMalloc(&g_idata, sz);
  cudaMalloc(&g_odata, sz);
  kernel_reduce_sum_3d_try02<<<dim/BLOCK_SIZE, BLOCK_SIZE>>>(
        g_idata,
        g_odata,
        dim,
        dim,
        dim,
        0,
        0,
        1);
  cudaDeviceSynchronize();
}
$ nvcc -o t1 t1.cu
$ cuda-memcheck ./t1
========= CUDA-MEMCHECK
thread: 0, src_index: 0
thread: 1, src_index: 256
thread: 2, src_index: 512
thread: 3, src_index: 768
thread: 4, src_index: 1024
thread: 5, src_index: 1280
thread: 6, src_index: 1536
thread: 7, src_index: 1792
thread: 8, src_index: 2048
thread: 9, src_index: 2304
thread: 10, src_index: 2560
thread: 11, src_index: 2816
thread: 12, src_index: 3072
thread: 13, src_index: 3328
thread: 14, src_index: 3584
thread: 15, src_index: 3840
thread: 16, src_index: 4096
thread: 17, src_index: 4352
thread: 18, src_index: 4608
thread: 19, src_index: 4864
thread: 20, src_index: 5120
thread: 21, src_index: 5376
thread: 22, src_index: 5632
thread: 23, src_index: 5888
thread: 24, src_index: 6144
thread: 25, src_index: 6400
thread: 26, src_index: 6656
thread: 27, src_index: 6912
thread: 28, src_index: 7168
thread: 29, src_index: 7424
thread: 30, src_index: 7680
thread: 31, src_index: 7936
========= ERROR SUMMARY: 0 errors
$

We see that for this particular access pattern, the threads in the warp are reading from locations whose indices are 256 apart (ideally we would like that "distance" to be 1 as we go from thread to thread in the warp, so that access to global memory is "coalesced").

So is that possible? It is, for each of the three summation directions, although a somewhat different approach/realization will be needed in each case. We will come back to this.

We also want to expose enough parallelism, which can be roughly translated as "be able to launch kernels of ~10,000 or more threads". The behavior here varies by GPU architecture and other factors, but it is a reasonable starting point for general understanding. A kernel with 256 threads in total, for example, is not enough to saturate any current CUDA GPU.

Given that this sum function will work on 3-dimensional data, let's start by considering a 3-dimensional array of about 256 elements per dimension. If we chose to create only 256 threads and work our way through the array, that would be too small for goal 1. So let's think about realizations that can handle 256x256 (or more) threads.

Case 1: sums in the X direction

I will assume C-style storage, so the X direction is the direction of linear storage in memory. Therefore, as we move from element to element along a row, we increase the memory storage index by 1. Summing along these "rows" therefore requires adjacent threads to read adjacent elements belonging to a particular sum. To allow adjacent threads to do this, and then cooperate to form the sum together, we will use the classical parallel reduction method. We will choose a grid of blocks of 256 threads each, with one block per sum: each block of 256 threads cooperates to compute one row sum. For convenience, the number of blocks equals the Y dimension of the array (256 in our case) times its Z dimension (also 256 in our example), i.e. 65536 blocks in total and therefore 65536*256 threads, satisfying our first goal. This is the only case that uses or needs shared memory.
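
A minimal sketch of that block-level pattern (a simplified version of what the full code below does, assuming blockDim.x is 256 and a power of 2):

// Each block computes the sum of one row of dim0 elements.
__global__ void row_sum(const float* in, float* out, size_t dim0) {
    __shared__ float sm[256];                    // assumes blockDim.x == 256
    size_t row_base = (size_t)blockIdx.x * dim0; // start of this block's row
    float val = 0;
    // block-stride loop: adjacent threads read adjacent elements (coalesced)
    for (size_t i = threadIdx.x; i < dim0; i += blockDim.x)
        val += in[row_base + i];
    sm[threadIdx.x] = val;
    __syncthreads();
    // classical shared-memory tree reduction across the block
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) sm[threadIdx.x] += sm[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) out[blockIdx.x] = sm[0];
}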

Case 2: sums in the Y direction

We might refer to these as "columns summed in the Y direction". To satisfy our second goal, we want each thread in a warp to read an adjacent element, but adjacent elements in memory now belong to separate Y-column sums. An efficient realization in this case, provided we can keep enough threads busy, is to compute a separate sum in each thread: each thread traverses down a Y column, keeping a running sum in a loop. A single Z-sheet (a slice in the X-Y plane) would only require a total of 256 threads to compute in our example, but we can increase the number of threads by working on all 256 sheets (in our example) at the same time. So we can employ 256x256 = 65536 threads, satisfying our first goal.

Case 3: sums in the Z direction (the example case in your question)

This case is just a small variation on Case 2. Once again, to satisfy goal 2, we want adjacent threads in the warp to read adjacent locations in memory, and once again these belong to separate sums. Now, however, we want the threads to traverse columns in the Z direction rather than the Y direction. The indexing will therefore be slightly different, but overall the realization is quite similar to Case 2. We again employ a grid of 256x256 threads, where each thread keeps a separate running sum. The starting point for each running sum, however, is the first "sheet"; each thread then traverses to the next sheet and the next in the Z direction, summing one "Z-column" per thread.
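
Stripped down to just this case, the pattern looks like this (a sketch of the Case 3 logic in the full code below):

// One thread per Z-column: thread idx sums in[idx], in[idx + dim0*dim1], in[idx + 2*dim0*dim1], ...
// On every iteration, adjacent threads read adjacent memory locations (coalesced).
__global__ void z_column_sum(const float* in, float* out,
                             size_t dim0, size_t dim1, size_t dim2) {
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < dim0 * dim1) {
        float sum = 0;
        size_t src = idx;
        for (size_t k = 0; k < dim2; k++) {
            sum += in[src];
            src += dim0 * dim1;   // step to the same (x, y) position in the next sheet
        }
        out[idx] = sum;
    }
}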

Here is a worked example demonstrating several of the cases:

$ cat t2.cu
#include <stdio.h>
#include <assert.h>

// modifiable
typedef float mytype;  // consider possibility of overflow e.g. for char types
const size_t ddimX = 32768;
const size_t ddimY = 100;
const size_t ddimZ = 100;

//don't modify
const int nTPB=256;

template <typename T>
__inline__ __device__
T warpReduceSum(T val) {
  for (int offset = warpSize/2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xFFFFFFFF, val, offset); // requires CUDA 9+
  return val;
}


template <typename T>
__inline__ __device__
T blockReduceSum(T val) {

  static __shared__ T shared[32]; // Shared mem for 32 partial sums
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;
  val = warpReduceSum(val);     // Each warp performs partial reduction
  if (lane==0) shared[wid]=val; // Write reduced value to shared memory
  __syncthreads();              // Wait for all partial reductions

                      //read from shared memory only if that warp existed
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

  if (wid==0) val = warpReduceSum(val); //Final reduce within first warp
  return val;
}

template <typename T>
__global__ void kernel_reduce_sum_3d(
        const T * __restrict__  g_idata,
        T * __restrict__  g_odata,
        const size_t dim0,
        const size_t dim1,
        const size_t dim2,
        const bool overaxis0,
        const bool overaxis1,
        const bool overaxis2)
{
  if (overaxis0){
    size_t bidx = blockIdx.x;
    size_t tidx = threadIdx.x;
    size_t limit;
    size_t base;
    T res = 0;
    if (!overaxis1 && !overaxis2){
    // Case 1 - sums in X-direction
    // each threadblock is responsible for a separate row sum
      limit = dim0;
      base = bidx*dim0;
      while (tidx < limit){
        res += g_idata[base+tidx];
        tidx += blockDim.x;}} // block-stride loop
    else if (overaxis1 && !overaxis2){
    // Case 4 - sums in X-Y plane
    // each threadblock will be responsible for an X-Y plane
      limit = dim0*dim1;
      base = bidx*dim0*dim1;
      while (tidx < limit){
        res += g_idata[base+tidx];
        tidx += blockDim.x;}} // block-stride loop
    else if (!overaxis1 && overaxis2){
    // Case 5 - sums in X-Z plane
    // each threadblock will be responsible for an X-Z plane
      for (int i = 0; i < dim2; i++){
      tidx = threadIdx.x;
      limit = dim0;
      base = (bidx*dim0)+(i*dim0*dim1);
      while (tidx < limit){
        res += g_idata[base+tidx];
        tidx += blockDim.x;}}} // block-stride loop
    else assert(0); // not implemented! - the remaining case here is all 3 axes selected
#ifndef USE_WS
    __shared__ T sm[nTPB];
    sm[threadIdx.x] = res;
    __syncthreads();
    // parallel reduction
    for (int i = blockDim.x>>1; i > warpSize; i>>=1){
      if (threadIdx.x < i) sm[threadIdx.x] += sm[threadIdx.x + i];
      __syncthreads();}
    for (int i = (blockDim.x == warpSize)?warpSize>>1:warpSize; i > 0; i>>=1){
      if (threadIdx.x < i) sm[threadIdx.x] += sm[threadIdx.x + i];
      if (threadIdx.x < warpSize) __syncwarp();}
    if (!threadIdx.x) g_odata[bidx] = sm[0];
#else
    res = blockReduceSum(res);
    if (!threadIdx.x) g_odata[bidx] = res;
#endif
    }
  else if (!overaxis0 && overaxis1 && !overaxis2){
    // Case 2 - sums in Y-direction
    // each thread is responsible for a separate Y-column sum
    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (dim0*dim2)){
      size_t tidx = idx%dim0 + (idx/dim0)*(dim0*dim1);
      T tsum = 0;
      for (size_t i = 0; i < dim1; i++){
        tsum += g_idata[tidx];
        tidx += dim0;}
      g_odata[idx] = tsum;
      }
    }
  else if (!overaxis0 && overaxis1 && overaxis2){
    // Case 6 - sums in Y-Z plane
    // each thread is responsible for a separate Y-Z plane sum (probably not optimal)
    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (dim0)){
      size_t tidx = idx;
      T tsum = 0;
      for (size_t i = 0; i < dim1*dim2; i++){
        tsum += g_idata[tidx];
        tidx += dim0;}
      g_odata[idx] = tsum;
      }
    }
  else if (!overaxis0 && !overaxis1 && overaxis2){
    // Case 3 - sums in Z-direction
    // each thread is responsible for a separate Z-column sum
    size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < (dim0*dim1)){
      size_t tidx = idx;
      T tsum = 0;
      for (size_t i = 0; i < dim2; i++){
        tsum += g_idata[tidx];
        tidx += dim0*dim1;}
      g_odata[idx] = tsum;
      }
    }
  else assert(0); // not implemented! - the remaining case here is no axes selected
}

template <typename T>
bool validate(T *data, size_t dim, T val){
  for (size_t i = 0; i < dim; i++)
    if (data[i] != val) {printf("mismatch at %lu, was: %f, should be: %f\n", i, (float)(data[i]), (float)val); return false;}
  return true;
}

size_t choose_block_size(size_t val){
  if (val >= nTPB) return nTPB;
  if (val <= 32) return 32;
  val = (val >> 1) | val;
  val = (val >> 2) | val;
  val = (val >> 4) | val;
  val = (val >> 8) | val;
  val = (val >> 16) | val;
  val++;
  return val;
}

int main(int argc, char *argv[]){
  size_t dimX = ddimX;
  size_t dimY = ddimY;
  size_t dimZ = ddimZ;
  if (argc > 1) {
    size_t nx = atoi(argv[1]);
    dimX = nx;
    dimY = nx-1;
    dimZ = nx-2;}
  // setup input array of all 1
  const size_t sz = dimX*dimY*dimZ*sizeof(mytype);
  mytype *d_in, *d_out, *h_in, *h_out;
  size_t rsz;
  h_in = new mytype[dimX*dimY*dimZ];
  assert(h_in != NULL);
  cudaError_t err = cudaMalloc(&d_in, sz);
  for (size_t i = 0; i < dimX*dimY*dimZ; i++) h_in[i] = 1;
  if (err != cudaSuccess) {printf("cudaMalloc1 error: %s\n", cudaGetErrorString(err)); return -1;}
  for (size_t i = 0; i < dimX*dimY*dimZ; i++) h_in[i] = 1;
  err = cudaMemcpy(d_in, h_in, sz, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {printf("cudaMemcpy1 error: %s\n", cudaGetErrorString(err)); return -1;}
  // Test X-direction sums
  printf("testing X-direction sums\n");
  rsz = dimY*dimZ*sizeof(mytype);
  h_out=new mytype[dimY*dimZ];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc2 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset1 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<dimY*dimZ, choose_block_size(dimX)>>>(d_in, d_out, dimX, dimY, dimZ, true, false, false);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result1 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimY*dimZ, (mytype)dimX))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test Y-direction sums
  printf("testing Y-direction sums\n");
  rsz = dimX*dimZ*sizeof(mytype);
  h_out=new mytype[dimX*dimZ];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc3 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset2 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<((dimX*dimZ)+(nTPB-1))/nTPB, nTPB>>>(d_in, d_out, dimX, dimY, dimZ, false, true, false);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result2 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimX*dimZ, (mytype)dimY))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test Z-direction sums
  printf("testing Z-direction sums\n");
  rsz = dimX*dimY*sizeof(mytype);
  h_out=new mytype[dimX*dimY];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc4 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset3 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<((dimX*dimY)+(nTPB-1))/nTPB, nTPB>>>(d_in, d_out, dimX, dimY, dimZ, false, false, true);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result3 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimX*dimY, (mytype)dimZ))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test X-Y plane sums
  printf("testing X-Y plane sums\n");
  rsz = dimZ*sizeof(mytype);
  h_out=new mytype[dimZ];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc5 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset4 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<dimZ, nTPB>>>(d_in, d_out, dimX, dimY, dimZ, true, true, false);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result4 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimZ, (mytype)(dimX*dimY)))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test X-Z plane sums
  printf("testing X-Z plane sums\n");
  rsz = dimY*sizeof(mytype);
  h_out=new mytype[dimY];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc6 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset5 error: %s\n", cudaGetErrorString(err)); return -1;}
  kernel_reduce_sum_3d<<<dimY, choose_block_size(dimX)>>>(d_in, d_out, dimX, dimY, dimZ, true, false, true);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result5 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimY, (mytype)(dimX*dimZ)))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  // Test Y-Z plane sums
  printf("testing Y-Z plane sums\n");
  rsz = dimX*sizeof(mytype);
  h_out=new mytype[dimX];
  err = cudaMalloc(&d_out, rsz);
  if (err != cudaSuccess) {printf("cudaMalloc7 error: %s\n", cudaGetErrorString(err)); return -1;}
  err = cudaMemset(d_out, 0, rsz);
  if (err != cudaSuccess) {printf("cudaMemset6 error: %s\n", cudaGetErrorString(err)); return -1;}
  size_t tpb = choose_block_size(dimX);
  kernel_reduce_sum_3d<<<(dimX+tpb-1)/tpb, tpb>>>(d_in, d_out, dimX, dimY, dimZ, false, true, true);
  err = cudaMemcpy(h_out, d_out, rsz, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {printf("result6 error: %s\n", cudaGetErrorString(err)); return -1;}
  if (!validate(h_out, dimX, (mytype)(dimY*dimZ)))  return -1;
  cudaFree(d_out);
  delete[] h_out;
  cudaFree(d_in);
  err=cudaGetLastError();
  if (err != cudaSuccess) {printf("CUDA error: %s\n", cudaGetErrorString(err)); return -1;}
  printf("success!\n");
  delete[] h_in;
  return 0;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
testing X-direction sums
testing Y-direction sums
testing Z-direction sums
testing X-Y plane sums
testing X-Z plane sums
testing Y-Z plane sums
success!
========= ERROR SUMMARY: 0 errors
$ nvprof --print-gpu-trace ./t2
==11084== NVPROF is profiling process 11084, command: ./t2
testing X-direction sums
testing Y-direction sums
testing Z-direction sums
testing X-Y plane sums
testing X-Z plane sums
testing Y-Z plane sums
success!
==11084== Profiling application: ./t2
==11084== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
2.32676s  478.32ms                    -               -         -         -         -  1.2207GB  2.5521GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
2.80537s  5.2480us                    -               -         -         -         -  39.063KB  7.0985GB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.80596s  39.427ms          (10000 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [109]
2.84539s  7.2320us                    -               -         -         -         -  39.063KB  5.1511GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
2.84586s  1.2160us                    -               -         -         -         -  12.500MB   1e+04GB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.84619s  22.838ms          (12800 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [114]
2.86904s  5.8913ms                    -               -         -         -         -  12.500MB  2.0721GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
2.88707s  1.1840us                    -               -         -         -         -  12.500MB   1e+04GB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.88740s  23.046ms          (12800 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [119]
2.91046s  5.5715ms                    -               -         -         -         -  12.500MB  2.1910GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
2.92758s  2.6240us                    -               -         -         -         -      400B  145.38MB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.92762s  40.990ms            (100 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [124]
2.96861s  1.5360us                    -               -         -         -         -      400B  248.35MB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
2.96898s  2.6240us                    -               -         -         -         -      400B  145.38MB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
2.96900s  43.392ms            (100 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [129]
3.01239s  1.5680us                    -               -         -         -         -      400B  243.28MB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]
3.01277s  1.2160us                    -               -         -         -         -  128.00KB  100.39GB/s      Device           -  Quadro K2000 (0         1         7  [CUDA memset]
3.01279s  23.059ms            (128 1 1)       (256 1 1)        32  1.0000KB        0B         -           -           -           -  Quadro K2000 (0         1         7  void kernel_reduce_sum_3d<float>(float const *, float*, unsigned long, unsigned long, unsigned long, bool, bool, bool) [134]
3.03585s  20.928us                    -               -         -         -         -  128.00KB  5.8329GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy

To answer your questions briefly:


Question 1. Is there a better way to assign threads and blocks to compute the output tensor?

As already indicated, your choice of indexing is not optimal, because it does not lead to coalesced access to global memory. The alternative realizations I have provided will result in coalesced reads from global memory.


Question 2. How can I increase occupancy in the kernel? (it is around 50%)

The figure of merit for this memory-bound kernel is not occupancy, but whether the kernel run time is approximately equal to the time it takes to read and write the data. The kernels above should satisfy that. If you meet that condition for a memory-bound kernel, there is no further improvement possible, regardless of occupancy.
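
As a rough check against the profiler output above (my own arithmetic, assuming the Quadro K2000's theoretical memory bandwidth of roughly 64 GB/s):

  input size      = 32768 x 100 x 100 x 4 bytes ≈ 1.31 GB
  Y/Z-sum kernels ≈ 23 ms each
  effective rate  ≈ 1.31 GB / 0.023 s ≈ 57 GB/s

which is already close to what the memory system can deliver, so occupancy is not the limiter for those kernels.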


Question 3. How should I use shared memory for the global memory read operations? (If "dim2" is large, each block would have to allocate a huge shared memory buffer, which hurts concurrent execution and occupancy.)

As I have shown, the best realizations for Case 2 and Case 3 (your case) do not require shared memory at all, and in Case 1 we can design a kernel that requires only a small amount of shared memory per block; not enough to be an occupancy concern or to cause any other problem.