Question

我正在通过UDACITY的在线课程学习CUDA编程。第二课给出了一段示例代码，其中有两个kernel：第一个 __global__ void increment_naive(int *g) 只是给数组 g 的元素加1，该数组驻留在全局内存中。

根据UDACITY的整个代码如下：

#include <stdio.h>
#include "gputimer.h"

#define NUM_THREADS 1000000
#define ARRAY_SIZE  100

#define BLOCK_WIDTH 1000

// Writes the first `size` ints of `array` to stdout on a single line,
// formatted as "{ a b c }" and terminated by a newline.
void print_array(int *array, int size)
{
    printf("{ ");
    int idx = 0;
    while (idx < size) {
        printf("%d ", array[idx]);
        ++idx;
    }
    printf("}\n");
}

// Naive example kernel: each thread adds 1 to one element of g using a plain,
// non-atomic read-modify-write.
// g: int array with at least ARRAY_SIZE elements.
// Whenever more threads are launched than ARRAY_SIZE, several threads map to
// the same element; they can all load the same old value and store the same
// new value, so increments are lost and the final counts are too low.
__global__ void increment_naive(int *g)
{
    // which thread is this?
    int i = blockIdx.x * blockDim.x + threadIdx.x; 

    // each thread to increment consecutive elements, wrapping at ARRAY_SIZE
    i = i % ARRAY_SIZE;  
    // separate load, add and store: nothing stops another thread from
    // updating g[i] between this thread's load and its store
    g[i] = g[i] + 1;
}

// Race-free counterpart of the naive kernel: every thread adds 1 to one
// element of g with atomicAdd, so concurrent updates to the same element
// are all counted.
// g: int array with at least ARRAY_SIZE elements.
__global__ void increment_atomic(int *g)
{
    // flat index of this thread across the whole grid
    const int global_tid = blockIdx.x * blockDim.x + threadIdx.x;

    // threads wrap around the array, so many threads share each slot
    int *slot = g + (global_tid % ARRAY_SIZE);
    atomicAdd(slot, 1);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) { return true; }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches one of the increment kernels with NUM_THREADS threads over an
// ARRAY_SIZE-element device array, then prints the resulting array and the
// elapsed kernel time. Returns 0 on success and 1 if any CUDA call fails.
int main(int argc,char **argv)
{
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

    // declare and allocate host memory
    int h_array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

    // declare, allocate, and zero out GPU memory
    int * d_array = NULL;
    if (!cuda_ok(cudaMalloc((void **) &d_array, ARRAY_BYTES), "cudaMalloc")) {
        return 1;
    }
    if (!cuda_ok(cudaMemset((void *) d_array, 0, ARRAY_BYTES), "cudaMemset")) {
        cudaFree(d_array);
        return 1;
    }

    // launch the kernel - comment out one of these
    timer.Start();
    increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
    //increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
    timer.Stop();

    // a kernel launch returns no status itself; a bad launch configuration
    // only shows up through cudaGetLastError()
    if (!cuda_ok(cudaGetLastError(), "kernel launch")) {
        cudaFree(d_array);
        return 1;
    }

    // copy back the array of sums from GPU and print; errors raised while the
    // kernel was running are reported by this blocking copy
    if (!cuda_ok(cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                 "cudaMemcpy")) {
        cudaFree(d_array);
        return 1;
    }
    print_array(h_array, ARRAY_SIZE);
    printf("Time elapsed = %g ms\n", timer.Elapsed());

    // free GPU memory allocation and exit
    cudaFree(d_array);
    return 0;
}

根据该程序，一百万个线程（分布在1000个块中）写入100个数组元素。因此，每个数组元素的结果都应该是10000。

第一个内核无法生成所需的输出，因为线程不同步访问会产生不良结果。这可以使用诸如__syncthreads之类的障碍或使用原子操作来解决。

第二个kernel工作正常并产生正确的输出，如下：

1000000 total threads in 1000 blocks writing into 100 array elements
{ 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000 }
Time elapsed = 0.367648 ms

如前所述，第一个kernel每次都产生错误的输出。

1000000 total threads in 1000 blocks writing into 100 array elements
{ 75 75 75 75 78 78 78 78 73 73 73 73 82 82 82 82 85 85 85 85 92 92 92 92 104 104 104 104 107 107 107 107 89 89 89 89 88 88 88 88 95 95 95 95 103 103 103 103 106 106 106 106 107 107 107 107 105 105 105 105 113 113 113 113 96 96 96 96 95 95 95 95 95 95 95 95 100 100 100 100 98 98 98 98 104 104 104 104 110 110 110 110 126 126 126 126 90 90 90 90 }
Time elapsed = 0.23392 ms

我试图通过在计算的不同阶段加入同步屏障（__syncthreads）来修复第一个kernel，但是没有得到期望的输出。我修复第一个kernel的尝试如下：

    // Attempted fix of the naive kernel by inserting __syncthreads() between
    // every step. The barriers do not make the update atomic:
    //  - __syncthreads() only synchronizes the threads of one block, so threads
    //    in other blocks that map to the same element are not ordered at all;
    //  - within a block, every thread that shares an element loads g[i] before
    //    the barrier and stores the same value after it, so their increments
    //    collapse into a single one.
    __global__ void increment_naive(int *g)
{
    // which thread is this?
    int i = blockIdx.x * blockDim.x + threadIdx.x; 
    __syncthreads();
    // each thread to increment consecutive elements, wrapping at ARRAY_SIZE
    //i = i % ARRAY_SIZE;
    int temp = i%ARRAY_SIZE;
    __syncthreads();
    i = temp;
    __syncthreads();
    //g[i] = g[i] + 1;
    // load + add into a per-thread temporary (the read half of the update)
    int temp1 = g[i]+1;
    __syncthreads();
    // store half of the update: threads sharing g[i] write the same value
    g[i] = temp1;
     __syncthreads();

}

我希望有人能引导我解决这个问题，因为这个问题困扰着我，阻碍了我对进一步发展的信心。

Answer 1

__syncthreads()函数确保同一个块中的所有线程都到达代码中的同一位置之后才继续执行。使用它并不能达到你想要的效果。更进一步说：假设CUDA是一台完美的并行机器，所有线程都以锁步方式执行，那么你根本不需要任何__syncthreads，但你得到的结果仍然不是你期望的。考虑以下伪代码及其中的解释：

// Pseudo-code (not compilable CUDA): what the naive increment would do on an
// idealized machine where all threads execute every statement in lockstep.
// Every thread reads 0, computes 1 and stores 1, so each element ends up as 1
// no matter how many threads share it.
__perfect_parallel_machine__ void increment_naive(int *g)
{
    int idx = thisThreadIdx % ARRAY_SIZE;
    int local = g[idx];
                               //*all* threads load the initial value of g[idx]
                               //each thread holds a separate copy of 'local' variable
                               //local=0 in each thread
    local = local + 1;
                               //each thread increment its own private copy of 'local'
                               //local=1 for all threads
    g[idx] = local;
                               //each thread stores the same value (1) into global array
                               //g = {1, 1, 1, 1, 1, ...., 1}
}

由于CUDA并不是一台完美的并行机器，各线程的执行并非严格同步，而是会乱序进行，所以最终数组中会得到比1更大的值。增加同步屏障只会让结果更接近上述“完美并行机器”的{1, 1, ... , 1}，而不是你想要的结果。

还有其他屏障函数，例如__threadfence()。它会让当前线程（仅当前线程！）等待，直到它对全局数组的写入保证对其他线程可见。这与L1 / L2缓存有关，与线程同步无关。例如，通常将__threadfence与原子操作配合使用，用来标记某些数据已经填写完毕。

我认为你和导师之间肯定会有一些误解。我建议和他谈谈澄清一下......

如何在不使用原子

1 个答案: