我有一个3D内核,我目前在一个块上运行:
// The two following variables are set elsewhere in the program.
// I give them possible value here for demonstration purposes.
int N[] = {14, 5, 1}; // array form: a brace list cannot initialize an int*
int L = 2; // N's size - 1
// Global thread coordinates. Each axis must combine its OWN block index with
// its own block dimension; using blockIdx.x for y and z only works while the
// whole grid is a single block (blockIdx.x == 0).
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
int idz = blockIdx.z * blockDim.z + threadIdx.z;
int idxInc = idx + 1; // for not to waste threads whose idx = 0
// Bounds checks: surplus threads (from rounding the grid up, or from layers
// smaller than the launch extent) fall through without touching dw.
if (idxInc >= 1 && idxInc <= L)
{
    if (idy < N[idxInc])
    {
        if (idz < N[idxInc-1])
        {
            dw[ idxInc ][ idy ][ idz ] = 0;
        }
    }
}
如果我在单个块上启动这个内核,块大小为{2,5,14},那么一切正常。这正是块的每个维度所需的线程数,内核可以据此处理前两行代码中定义的数据。但现在,我不知道该如何把这项工作划分到多个块上。仅仅是尝试为两个块找出每个维度上正确的线程数,就已经让我的大脑转不过来了。此外,L可能会有所不同(但我可能会对此加以限制),而N[1]更有可能大幅变化(在本例中为5,但也可能是128、256或2048……)。所以我必须找到一种算法,来自动平衡块的数量,以及块的三个维度中每个维度的线程数。
我真的不明白该怎么做,现在我觉得很蠢! 我开始认为我应该停止玩3维...或者也许有一个我看不到的简单技巧......
有人能帮帮我吗? 谢谢!
编辑:下面是按顺序(串行)检查结果的代码......
// Sequentially prints dw for visual inspection.
// layer, i, j, L, N and dw are not declared in this fragment; they come from
// the surrounding program.
// Walks layer = 1..L; for each layer prints N[layer] rows of N[layer-1] values.
for (layer = 1; layer <= L; layer++)
{
for (i = 0; i < N[layer]; i++)
{
for (j = 0; j < N[layer-1]; j++)
{
// "%1.0f" prints the value with no fractional digits and no separator,
// so one row appears as a run of digits.
printf("%1.0f", dw[ layer ][ i ][ j ]);
}
// End of one row.
printf("\n");
}
// Blank line between layers.
printf("\n");
}
显示的每个数字都应为0.
答案 0 :(得分:1)
下面是一个简单的示例代码,(我认为)它实现了你所描述的内容:
#include <stdio.h>
#include <stdlib.h>
/*
 * cudaCheckErrors(msg)
 *
 * Fetches the most recent CUDA runtime error via cudaGetLastError(). If it is
 * anything other than cudaSuccess, writes msg, the CUDA error string and the
 * file/line of the macro invocation to stderr, then terminates with exit(1).
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
#define cudaCheckErrors(msg)                                        \
    do {                                                            \
        const cudaError_t lastCudaStatus_ = cudaGetLastError();     \
        if (cudaSuccess != lastCudaStatus_) {                       \
            fprintf(stderr,                                         \
                    "Fatal error: %s (%s at %s:%d)\n",              \
                    msg,                                            \
                    cudaGetErrorString(lastCudaStatus_),            \
                    __FILE__,                                       \
                    __LINE__);                                      \
            fprintf(stderr, "*** FAILED - ABORTING\n");             \
            exit(1);                                                \
        }                                                           \
    } while (0)
// Extent of the data volume along x, y and z.
// For simplicity the extents are chosen as exact multiples of the block
// dimensions below.
#define XSIZE 1024
#define YSIZE 14
#define ZSIZE 2
// Total number of elements in the volume.
#define TSIZE (XSIZE*YSIZE*ZSIZE)
// Threads per block along x, y and z.
#define BLKX 16
#define BLKY 14
#define BLKZ 2
// Flattens (x,y,z) into a linear offset: x varies fastest, then y, then z.
// Every argument is parenthesized so that expression arguments such as
// IDX(i+1, j, k) expand with the intended precedence.
#define IDX(x,y,z) (((z)*(XSIZE*YSIZE))+((y)*XSIZE)+(x))
// Element type of the data volume.
typedef float mytype;
// Writes each element's global x coordinate into the flattened 3D volume.
//
// Launch layout: a 3D grid of 3D blocks; every thread derives one (x,y,z)
// coordinate from its block and thread indices. Threads whose coordinate lies
// outside XSIZE x YSIZE x ZSIZE write nothing.
//
// data: buffer indexed through IDX(x,y,z); the thread at (x,y,z) stores
//       (mytype)x there.
//
// Debug aid: the single thread at global coordinate (127,13,1) prints
// "BONJOUR"; this test is independent of the bounds check above it.
__global__ void mykernel(mytype *data){
    const int gx = blockDim.x*blockIdx.x + threadIdx.x;
    const int gy = blockDim.y*blockIdx.y + threadIdx.y;
    const int gz = blockDim.z*blockIdx.z + threadIdx.z;

    const bool insideVolume = (gx < XSIZE) && (gy < YSIZE) && (gz < ZSIZE);
    if (insideVolume) {
        data[IDX(gx,gy,gz)] = (mytype)gx;
    }

    const bool isProbeThread = (gx == 127) && (gy == 13) && (gz == 1);
    if (isProbeThread) {
        printf("BONJOUR\n");
    }
}
// Host driver: zero-fills a XSIZE x YSIZE x ZSIZE volume, copies it to the
// device, runs mykernel over it, copies the result back and verifies that
// every element equals its x coordinate.
// Returns 0 when the check passes, 1 on host allocation failure or on the
// first mismatching element. CUDA errors abort through cudaCheckErrors.
int main(){
    dim3 block(BLKX, BLKY, BLKZ);
    // Round the block count up on every axis so the grid still covers the
    // whole volume if a size is not an exact multiple of the block dimension;
    // mykernel's bounds check makes the surplus threads harmless.
    dim3 grid((XSIZE + BLKX - 1)/BLKX,
              (YSIZE + BLKY - 1)/BLKY,
              (ZSIZE + BLKZ - 1)/BLKZ);

    const size_t bytes = (size_t)TSIZE*sizeof(mytype);
    mytype *h_data = (mytype *)malloc(bytes);
    mytype *d_data = 0;
    if (h_data == 0) {printf("malloc fail\n"); return 1;}
    cudaMalloc((void **)&d_data, bytes);
    cudaCheckErrors("cudaMalloc fail");

    // Start from an all-zero volume so stale values cannot satisfy the check.
    for (int x=0; x<XSIZE; x++)
        for (int y=0; y<YSIZE; y++)
            for (int z=0; z<ZSIZE; z++)
                h_data[IDX(x,y,z)] = (mytype)0;
    cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail");

    mykernel<<<grid, block>>>(d_data);
    // Wait for the kernel so that execution errors surface in the check below.
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");

    // Verify; stop at the first mismatch but fall through to the cleanup.
    int status = 0;
    for (int x=0; x<XSIZE && status==0; x++)
        for (int y=0; y<YSIZE && status==0; y++)
            for (int z=0; z<ZSIZE && status==0; z++)
                if(h_data[IDX(x,y,z)] != (mytype)x) {
                    // %f expects a double: convert both values explicitly
                    // (passing the int x directly is undefined behavior).
                    printf("data check fail at (x,y,z) = (%d, %d, %d), was: %f, should be: %f\n", x,y,z, (double)h_data[IDX(x,y,z)], (double)x);
                    status = 1;
                }
    if (status == 0) printf("Data check passed!\n");

    // Release device and host buffers on both the pass and fail paths.
    cudaFree(d_data);
    cudaCheckErrors("cudaFree fail");
    free(h_data);
    return status;
}
编译:
nvcc -arch=sm_20 -o t159 t159.cu
当我运行它时,我得到:
BONJOUR
Data check passed!