Here is the code:
#include "common/book.h"
#define N 36
__global__ void add(int *a, int *b, int *c) {
int tid = blockIdx.x * gridDim.y * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
if(tid < N) {
c[tid] = a[tid] + b[tid];
}
}
int main() {
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
cudaMalloc( (void**) &dev_a, N * sizeof(int));
cudaMalloc( (void**) &dev_b, N * sizeof(int));
cudaMalloc( (void**) &dev_c, N * sizeof(int));
for (int i = 0; i < N; i++) {
a[i] = -1;
b[i] = i * i;
}
cudaMemcpy(
dev_a,
a,
N * sizeof(int),
cudaMemcpyHostToDevice
);
cudaMemcpy(
dev_b,
b,
N * sizeof(int),
cudaMemcpyHostToDevice
);
dim3 grid_dim(3, 2);
dim3 block_dim(3, 2);
add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);
cudaMemcpy(
c,
dev_c,
N * sizeof(int),
cudaMemcpyDeviceToHost
);
for (int i = 0; i < N; i++) {
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
}
Basically, I am trying to add two vectors element by element on a grid with a 3x2 block layout, where each block in the grid has a 3x2 thread layout.
Here is the output from running the compiled binary:
-1 + 0 = -1
-1 + 1 = 0
-1 + 4 = 3
-1 + 9 = 8
-1 + 16 = 15
-1 + 25 = 24
-1 + 36 = 0
-1 + 49 = 0
-1 + 64 = 0
-1 + 81 = 0
-1 + 100 = 0
-1 + 121 = 0
-1 + 144 = 143
-1 + 169 = 168
-1 + 196 = 195
-1 + 225 = 224
-1 + 256 = 255
-1 + 289 = 288
-1 + 324 = 0
-1 + 361 = 0
-1 + 400 = 0
-1 + 441 = 0
-1 + 484 = 0
-1 + 529 = 0
-1 + 576 = 575
-1 + 625 = 624
-1 + 676 = 675
-1 + 729 = 728
-1 + 784 = 783
-1 + 841 = 840
-1 + 900 = 0
-1 + 961 = 0
-1 + 1024 = 0
-1 + 1089 = 0
-1 + 1156 = 0
-1 + 1225 = 0
Apparently some blocks are being ignored. I have also tried varying how tid is computed in the add kernel function, but some blocks always get lost.

Any suggestions?
Answer 0 (score: 2)
The only problem is the tid computation, as you have already guessed.
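To see why it fails, note that the formula in the question never reads blockIdx.y, so the two rows of blocks in the 3x2 grid compute identical indices, and the per-block stride blockIdx.x * gridDim.y * blockDim.x * blockDim.y (12 here) jumps past elements 6-11, 18-23, and 30-35, exactly the positions that print 0 in the output above (those elements of c are never written). A minimal host-side sketch (my own illustration, just replaying the same arithmetic on the CPU) makes the collisions and gaps visible:

#include <cstdio>

int main() {
    // Dimensions from the question: a 3x2 grid of 3x2 blocks.
    const int grid_x = 3, grid_y = 2, block_x = 3, block_y = 2;
    for (int by = 0; by < grid_y; by++)
        for (int bx = 0; bx < grid_x; bx++)
            for (int ty = 0; ty < block_y; ty++)
                for (int tx = 0; tx < block_x; tx++) {
                    // The question's formula: by (blockIdx.y) is never used,
                    // so blocks (bx,0) and (bx,1) produce the same tids.
                    int tid = bx * grid_y * block_x * block_y + ty * block_x + tx;
                    printf("block (%d,%d) thread (%d,%d) -> tid %2d\n", bx, by, tx, ty, tid);
                }
    return 0;
}

Every tid is printed twice (once per row of blocks), and tids 6-11, 18-23, and 30-35 never occur.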
There are many ways to perform the mapping and create an algorithm. For a generic 2D grid, I find it convenient (i.e., an easy-to-remember method) to first create 2D indices in x and y, and then build a thread-unique 1D index as the grid width (in x) times the y index, plus the x index:
int idy = threadIdx.y+blockDim.y*blockIdx.y; // y-index
int idx = threadIdx.x+blockDim.x*blockIdx.x; // x-index
int tid = gridDim.x*blockDim.x*idy + idx; // thread-unique 1D index
gridDim.x*blockDim.x is the grid width in x, expressed in threads.
When we use this generic 2D indexing scheme in your code, it appears to work correctly for me:
$ cat t10.cu
#include <stdio.h>

#define N 36

__global__ void add(int *a, int *b, int *c) {
    int idy = threadIdx.y + blockDim.y * blockIdx.y;  // y-index
    int idx = threadIdx.x + blockDim.x * blockIdx.x;  // x-index
    int tid = gridDim.x * blockDim.x * idy + idx;     // thread-unique 1D index
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

int main() {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    cudaMalloc((void**) &dev_a, N * sizeof(int));
    cudaMalloc((void**) &dev_b, N * sizeof(int));
    cudaMalloc((void**) &dev_c, N * sizeof(int));

    for (int i = 0; i < N; i++) {
        a[i] = -1;
        b[i] = i * i;
    }

    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);

    dim3 grid_dim(3, 2);
    dim3 block_dim(3, 2);
    add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);

    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
$ nvcc -arch=sm_35 -o t10 t10.cu
$ cuda-memcheck ./t10
========= CUDA-MEMCHECK
-1 + 0 = -1
-1 + 1 = 0
-1 + 4 = 3
-1 + 9 = 8
-1 + 16 = 15
-1 + 25 = 24
-1 + 36 = 35
-1 + 49 = 48
-1 + 64 = 63
-1 + 81 = 80
-1 + 100 = 99
-1 + 121 = 120
-1 + 144 = 143
-1 + 169 = 168
-1 + 196 = 195
-1 + 225 = 224
-1 + 256 = 255
-1 + 289 = 288
-1 + 324 = 323
-1 + 361 = 360
-1 + 400 = 399
-1 + 441 = 440
-1 + 484 = 483
-1 + 529 = 528
-1 + 576 = 575
-1 + 625 = 624
-1 + 676 = 675
-1 + 729 = 728
-1 + 784 = 783
-1 + 841 = 840
-1 + 900 = 899
-1 + 961 = 960
-1 + 1024 = 1023
-1 + 1089 = 1088
-1 + 1156 = 1155
-1 + 1225 = 1224
========= ERROR SUMMARY: 0 errors
$
The above should give correct results. Performance-wise, this is probably not the most efficient mapping for this toy problem: the threadblock size here is not a multiple of 32, which is generally not recommended for efficient CUDA programming. Rather than try to come up with an optimal mapping (in terms of performance/efficiency) for this case, my suggestion would be to reorganize your threadblocks so that each block has at least a multiple of 32 threads. I would also suggest at least 16 or 32 threads in the x dimension of the block, which keeps the indexing easy to understand and yields approximately optimal memory access performance.
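For instance, a minimal sketch of such a reorganized launch (my own illustration, not tested code from this answer) would use 32 threads in the x dimension of each block and enough blocks to cover N; the kernel's tid < N check already handles the excess threads:

    // Illustrative launch configuration: one 32-wide row of threads per block.
    dim3 block_dim(32, 1);
    // Round up so grid_dim.x * 32 covers all N elements.
    dim3 grid_dim((N + block_dim.x - 1) / block_dim.x, 1);
    add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);

With N = 36 this launches 2 blocks of 32 threads; the indexing scheme above reduces to tid = idx, and the out-of-range threads simply do nothing.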