CUDA矢量加法中有一些元素没有被计算

时间:2017-12-20 15:41:26

标签: c cuda

以下是代码:

#include "common/book.h"

#define N 36 

// Kernel: element-wise vector add, c[i] = a[i] + b[i].
// BUG (the subject of this question): the tid formula never uses blockIdx.y,
// and multiplies blockIdx.x by gridDim.y*blockDim.x*blockDim.y. With a 3x2
// grid of 3x2 blocks this maps both y-rows of blocks onto the same indices
// (0-5, 12-17, 24-29), so those elements are written twice and the rest are
// never computed -- matching the zeros in the output shown below.
__global__ void add(int *a, int *b, int *c) {
    int tid = blockIdx.x * gridDim.y * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; 
    if(tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

// Host driver: allocate three N-element device buffers, fill the host inputs
// (a[i] = -1, b[i] = i*i), copy them to the device, launch add on a 3x2 grid
// of 3x2-thread blocks (36 threads total for N = 36), copy the result back
// and print every element.
// NOTE(review): no CUDA error checking on any API call, kernel launch, or
// copy -- any failure here would be silent.
int main() {
    int a[N], b[N], c[N]; 
    int *dev_a, *dev_b, *dev_c; 
    cudaMalloc( (void**) &dev_a, N * sizeof(int));
    cudaMalloc( (void**) &dev_b, N * sizeof(int));
    cudaMalloc( (void**) &dev_c, N * sizeof(int));
    // Host-side initialization of the two input vectors.
    for (int i = 0; i < N; i++) {
        a[i] = -1; 
        b[i] = i * i;
    }

    cudaMemcpy(
                dev_a, 
                a, 
                N * sizeof(int),
                cudaMemcpyHostToDevice
                   );
    cudaMemcpy(
                dev_b, 
                b, 
                N * sizeof(int),
                cudaMemcpyHostToDevice
                   );
    // 3x2 blocks, each 3x2 threads: exactly 36 threads, one per element.
    dim3 grid_dim(3, 2);
    dim3 block_dim(3, 2);
    add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);
    // Blocking device-to-host copy also synchronizes with the kernel.
    cudaMemcpy(
                c, 
                dev_c, 
                N * sizeof(int),
                cudaMemcpyDeviceToHost
                   );
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}

基本上,我试图在具有3x2布局的网格上逐个元素地添加两个向量,网格中的每个块具有3x2线程布局。

以下是运行已编译二进制文件时的结果:

-1 + 0 = -1
-1 + 1 = 0
-1 + 4 = 3
-1 + 9 = 8
-1 + 16 = 15
-1 + 25 = 24
-1 + 36 = 0
-1 + 49 = 0
-1 + 64 = 0
-1 + 81 = 0
-1 + 100 = 0
-1 + 121 = 0
-1 + 144 = 143
-1 + 169 = 168
-1 + 196 = 195
-1 + 225 = 224
-1 + 256 = 255
-1 + 289 = 288
-1 + 324 = 0
-1 + 361 = 0
-1 + 400 = 0
-1 + 441 = 0
-1 + 484 = 0
-1 + 529 = 0
-1 + 576 = 575
-1 + 625 = 624
-1 + 676 = 675
-1 + 729 = 728
-1 + 784 = 783
-1 + 841 = 840
-1 + 900 = 0
-1 + 961 = 0
-1 + 1024 = 0
-1 + 1089 = 0
-1 + 1156 = 0
-1 + 1225 = 0

显然有些线程块被忽略了。我也尝试过在内核函数add中用其他方式计算tid,但总会丢失一些线程块。

有什么建议吗?

1 个答案:

答案 0 :(得分:2)

唯一的问题就是你已经猜到的:tid的计算方式。

有许多方法可以执行映射并创建算法。对于通用2D网格,我发现在x和y中创建2D索引很方便(即易于记忆的方法),然后使用网格宽度(在x中)乘以y索引加上x索引,创建一个线程唯一的1-D索引:

int idy = threadIdx.y+blockDim.y*blockIdx.y;  // y-index
int idx = threadIdx.x+blockDim.x*blockIdx.x;  // x-index
int tid = gridDim.x*blockDim.x*idy + idx;     // thread-unique 1D index

gridDim.x*blockDim.x是x中的网格宽度,以线程为单位表示。

当我们在您的代码中使用这种通用的二维索引方案时,它似乎对我来说正常工作:

$ cat t10.cu
#include <stdio.h>

#define N 36

// Corrected kernel: element-wise vector add, c[tid] = a[tid] + b[tid].
// Expects a 2D grid of 2D blocks; flattens the (x, y) thread coordinates
// into a unique 1D index using the grid width measured in threads.
__global__ void add(int *a, int *b, int *c) {
    int idy = threadIdx.y+blockDim.y*blockIdx.y;  // global y-index of this thread
    int idx = threadIdx.x+blockDim.x*blockIdx.x;  // global x-index of this thread
    int tid = gridDim.x*blockDim.x*idy + idx;     // (grid width in threads) * y + x
    if(tid < N) {                                 // bounds guard for the grid tail
        c[tid] = a[tid] + b[tid];
    }
}

// Host driver (identical to the question's main): allocate device buffers,
// initialize a[i] = -1 and b[i] = i*i on the host, copy to the device,
// launch add on a 3x2 grid of 3x2-thread blocks, copy back and print.
// NOTE(review): still no CUDA error checking; the cuda-memcheck run below
// is what demonstrates the program executed cleanly.
int main() {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    cudaMalloc( (void**) &dev_a, N * sizeof(int));
    cudaMalloc( (void**) &dev_b, N * sizeof(int));
    cudaMalloc( (void**) &dev_c, N * sizeof(int));
    for (int i = 0; i < N; i++) {
        a[i] = -1;
        b[i] = i * i;
    }

    cudaMemcpy(
                dev_a,
                a,
                N * sizeof(int),
                cudaMemcpyHostToDevice
                   );
    cudaMemcpy(
                dev_b,
                b,
                N * sizeof(int),
                cudaMemcpyHostToDevice
                   );
    // 6 blocks x 6 threads = 36 threads, one per element of N = 36.
    dim3 grid_dim(3, 2);
    dim3 block_dim(3, 2);
    add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);
    // Blocking copy back to the host; also synchronizes with the kernel.
    cudaMemcpy(
                c,
                dev_c,
                N * sizeof(int),
                cudaMemcpyDeviceToHost
                   );
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
$ nvcc -arch=sm_35 -o t10 t10.cu
$ cuda-memcheck ./t10
========= CUDA-MEMCHECK
-1 + 0 = -1
-1 + 1 = 0
-1 + 4 = 3
-1 + 9 = 8
-1 + 16 = 15
-1 + 25 = 24
-1 + 36 = 35
-1 + 49 = 48
-1 + 64 = 63
-1 + 81 = 80
-1 + 100 = 99
-1 + 121 = 120
-1 + 144 = 143
-1 + 169 = 168
-1 + 196 = 195
-1 + 225 = 224
-1 + 256 = 255
-1 + 289 = 288
-1 + 324 = 323
-1 + 361 = 360
-1 + 400 = 399
-1 + 441 = 440
-1 + 484 = 483
-1 + 529 = 528
-1 + 576 = 575
-1 + 625 = 624
-1 + 676 = 675
-1 + 729 = 728
-1 + 784 = 783
-1 + 841 = 840
-1 + 900 = 899
-1 + 961 = 960
-1 + 1024 = 1023
-1 + 1089 = 1088
-1 + 1156 = 1155
-1 + 1225 = 1224
========= ERROR SUMMARY: 0 errors
$

以上应该会给出正确的结果。在性能方面,这对于这个玩具问题来说可能不是最高效的映射。这个问题中的线程块大小不是32的倍数,而这通常不利于高效的CUDA编程。对于这种情况,我没有尝试给出(性能/效率上)最优的映射;我的建议是重新组织你的线程块,使每个块的线程数至少是32的倍数。我还建议至少考虑在块的x维度上使用16或32个线程,这样索引容易理解,并能获得接近最优的内存访问性能。