Question

我有一个3D内核，我目前在一个块上运行：

// The two following variables are set elsewhere in the program.
// I give them possible value here for demonstration purposes.
int* N = {14, 5, 1};
int L = 2; // N's size - 1

int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.x * blockDim.y + threadIdx.y;
int idz = blockIdx.x * blockDim.z + threadIdx.z;

int idxInc = idx + 1; // for not to waste threads whose idx = 0
if (idxInc >= 1 && idxInc <= L)
{
    if (idy < N[idxInc])
    {       
        if (idz < N[idxInc-1])
        {
            dw[ idxInc ][ idy ][ idz ] = 0;
        }
    }
}

如果我在一个块上启动这个内核，其大小为{2,5,14}，那么一切都会好的。这正是块的每个维度所需的线程数，内核可以对两个第一行中定义的数据执行任务。现在，我没有看到如何在多个块之间划分这项工作。我的大脑错误只是试图在两个块上找到每个维度的正确数量的线程。此外，L可能会有所不同（但我可能会对此加以限制），更可能N [1]会有很大变化（在本例中为5，但可能是128,256或2048 ......）。所以我必须找到一种算法来自动平衡块的数量，以及块的三个维度中每个维度的线程数。

我真的不明白该怎么做，现在我觉得很蠢！我开始认为我应该停止玩3维...或者也许有一个我看不到的简单技巧......

有些帮助吗？谢谢！

编辑：连续检查结果......

for (layer = 1; layer <= L; layer++)
{
    for (i = 0; i < N[layer]; i++)
    {
        for (j = 0; j < N[layer-1]; j++)
        {
            printf("%1.0f", dw[ layer ][ i ][ j ]);
        }
        printf("\n");
    }
    printf("\n");
}

显示的每个数字都应为0.

Answer 1

这是一个简单的示例代码（我认为）你所描述的内容：

#include <stdio.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// for simplicity assume grid is an even multiple of blocksizes
#define XSIZE 1024
#define YSIZE 14
#define ZSIZE 2
#define TSIZE (XSIZE*YSIZE*ZSIZE)
#define BLKX 16
#define BLKY 14
#define BLKZ 2


#define IDX(x,y,z) ((z*(XSIZE*YSIZE))+(y*XSIZE)+x)

typedef float mytype;

__global__ void mykernel(mytype *data){

  int idx = threadIdx.x + blockDim.x*blockIdx.x;
  int idy = threadIdx.y + blockDim.y*blockIdx.y;
  int idz = threadIdx.z + blockDim.z*blockIdx.z;

  if ((idx < XSIZE)&&(idy < YSIZE)&&(idz < ZSIZE))
    data[IDX(idx,idy,idz)] = (mytype)idx;
  if ((idx==127)&&(idy==13)&&(idz==1)) printf("BONJOUR\n");
}

int main(){


// for simplicity assume grid is an even multiple of blocksizes
  dim3 block(BLKX, BLKY, BLKZ);
  dim3 grid(XSIZE/BLKX, YSIZE/BLKY, ZSIZE/BLKZ);

  mytype *h_data, *d_data;

  h_data=(mytype *)malloc(TSIZE*sizeof(mytype));
  if (h_data == 0) {printf("malloc fail\n"); return 1;}
  cudaMalloc((void **)&d_data, TSIZE*sizeof(mytype));
  cudaCheckErrors("cudaMalloc fail");

  for (int x=0; x<XSIZE; x++)
    for (int y=0; y<YSIZE; y++)
      for (int z=0; z<ZSIZE; z++)
        h_data[IDX(x,y,z)] = (mytype)0;

  cudaMemcpy(d_data, h_data, TSIZE*sizeof(mytype), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");

  mykernel<<<grid, block>>>(d_data);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");

  cudaMemcpy(h_data, d_data, TSIZE*sizeof(mytype), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy fail");


  for (int x=0; x<XSIZE; x++)
    for (int y=0; y<YSIZE; y++)
      for (int z=0; z<ZSIZE; z++)
        if(h_data[IDX(x,y,z)] != (mytype)x) {printf("data check fail at (x,y,z) = (%d, %d, %d), was: %f, should be: %f\n", x,y,z, h_data[IDX(x,y,z)], x); return 1;}
  printf("Data check passed!\n");


  return 0;
}

编译：

nvcc -arch=sm_20 -o t159 t159.cu

当我运行它时，我得到：

BONJOUR
Data check passed!

在块之间划分3D内核的工作

1 个答案: