CUDA atomic operation performance in different scenarios

Date: 2014-03-13 01:15:36

Tags: cuda gpu gpgpu nvidia atomic

When I came across this question on SO, I was curious to know the answer. So I wrote the piece of code below to test atomic operation performance in different scenarios. The OS is Ubuntu 12.04 with CUDA 5.5 and the device is a GeForce GTX 780 (Kepler architecture). I compiled the code with the -O3 flag and for CC=3.5.
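
For reference, a plausible build-and-profile invocation consistent with these flags (the file name atomics_test.cu is my own placeholder, not from the original post):

    nvcc -O3 -arch=sm_35 atomics_test.cu -o atomics_test
    nvprof ./atomics_test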

#include <stdio.h>
#include <stdlib.h> // for exit()

static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

#define BLOCK_SIZE 256
#define RESTRICTION_SIZE 32

__global__ void CoalescedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+i, 6);  //arbitrary number to add
    }
}

__global__ void AddressRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+(i&(RESTRICTION_SIZE-1)), 6);   //arbitrary number to add
    }
}

__global__ void WarpRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+(i>>5), 6); //arbitrary number to add
    }
}

__global__ void SameAddressAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data, 6);    //arbitrary number to add
    }
}

__global__ void CoalescedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+threadIdx.x, data[i]);
    }
}

__global__ void AddressRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+(threadIdx.x&(RESTRICTION_SIZE-1)), data[i&(RESTRICTION_SIZE-1)]);
    }
}

__global__ void WarpRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+(threadIdx.x>>5), data[i>>5]);
    }
}

__global__ void SameAddressAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data, data[0]);
    }
}

int main(void)
{

    const int n = 2 << 24;
    int* data = new int[n];

    int i;
    for(i=0; i<n; i++) {
        data[i] = i%1024+1;
    }

    int* dev_data;
    HANDLE_ERROR( cudaMalloc((void **)&dev_data, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemset(dev_data, 0, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemcpy( dev_data, data, n * sizeof(int), cudaMemcpyHostToDevice) );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        CoalescedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        AddressRestrictedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        WarpRestrictedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        SameAddressAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        CoalescedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        AddressRestrictedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        WarpRestrictedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        SameAddressAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    HANDLE_ERROR( cudaDeviceReset() );
    printf("Program finished without error.\n");
    return 0;
}

Basically, there are 8 kernels in the above code, in which all threads perform atomicAdd over all the data:

  1. Coalesced atomic addition on global memory.
  2. Atomic addition on a restricted address space in global memory (restricted to 32 addresses in the code).
  3. Atomic addition by the lanes of each warp on the same address in global memory.
  4. Atomic addition by all threads on the same address in global memory.

Items 5 through 8 are obtained by replacing global with shared in the items above. The chosen block size is 256.

I used nvprof to profile the program. The output is:

    Time(%)      Time     Calls       Avg       Min       Max  Name
    44.33%  2.35113s        50  47.023ms  46.987ms  47.062ms  SameAddressAtomicOnSharedMem(int*, int)
    31.89%  1.69104s        50  33.821ms  33.818ms  33.826ms  SameAddressAtomicOnGlobalMem(int*, int)
    10.10%  535.88ms        50  10.718ms  10.707ms  10.738ms  WarpRestrictedAtomicOnSharedMem(int*, int)
    3.96%  209.95ms        50  4.1990ms  4.1895ms  4.2103ms  AddressRestrictedAtomicOnSharedMem(int*, int)
    3.95%  209.47ms        50  4.1895ms  4.1893ms  4.1900ms  AddressRestrictedAtomicOnGlobalMem(int*, int)
    3.33%  176.48ms        50  3.5296ms  3.5050ms  3.5498ms  WarpRestrictedAtomicOnGlobalMem(int*, int)
    1.08%  57.428ms        50  1.1486ms  1.1460ms  1.1510ms  CoalescedAtomicOnGlobalMem(int*, int)
    0.84%  44.784ms        50  895.68us  888.65us  905.77us  CoalescedAtomicOnSharedMem(int*, int)
    0.51%  26.805ms         1  26.805ms  26.805ms  26.805ms  [CUDA memcpy HtoD]
    0.01%  543.61us         1  543.61us  543.61us  543.61us  [CUDA memset]
    

Obviously, coalesced conflict-free atomic operations have the best performance, and same-address atomics have the worst. One thing I cannot explain is why same-address atomics on shared memory (within a block) are slower than on global memory (common among all threads). When all lanes of a warp access the same location in shared memory the performance is very bad, but (surprisingly) that is not the case when they do it on global memory. I cannot explain why. Another confusing case is that address-restricted atomics on global memory perform worse than when all threads inside a warp operate on the same address, even though memory contention seems lower in the first case.

In any case, I would be happy if someone could explain the profiling results above.

1 Answer:

Answer 0: (score: 7)

As a preface, my comments here may be architecture-specific to some degree. But for the architectures at hand (up to cc 3.5, AFAIK), shared-memory atomics are implemented via a code sequence created by the assembler. If multiple threads contend for access to the same bank/location, this code sequence operating on shared memory will serialize.

The RMW operation itself is atomic in the sense that no other thread can disrupt the operation (i.e., create incorrect results), but when threads contend to perform an atomic operation on a single shared-memory location, the contention gives rise to serialization, exacerbating the latency associated with atomics.
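
To make the serialization concrete, here is a minimal sketch of that retry behavior, modeled with atomicCAS (my own illustration; the actual pre-Maxwell hardware sequence uses dedicated shared-memory lock/unlock accesses rather than atomicCAS, but the contention pattern is the same):

    __device__ int lockLoopAddSketch(int* addr, int val)
    {
        // Each contending lane repeats the read-modify-write attempt until
        // it wins; the losers loop again, so N lanes hitting one location
        // take roughly N passes -- the serialization described above.
        int old = *addr, assumed;
        do {
            assumed = old;
            old = atomicCAS(addr, assumed, assumed + val);
        } while (old != assumed);
        return old; // value before this thread's addition, like atomicAdd
    }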

Quoting Nick from The CUDA Handbook:

  Unlike global memory, which implements atomics using single instructions (either GATOM or GRED, depending on whether a return value is used), shared memory atomics are implemented with explicit lock/unlock semantics, and the compiler emits code that causes each thread to loop over these lock operations until the thread has performed its atomic operation.

Take care to avoid contention, or the loop in Listing 8-2 can iterate up to 32 times.
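
One standard way to avoid that worst case is to pre-reduce within the warp so that only a single lane issues the atomic. A hedged sketch under my own assumptions (Kepler's __shfl_down intrinsic, available for cc 3.0+ in CUDA 5.5, and a fully active warp):

    __device__ void warpAggregatedAtomicAdd(int* addr, int val)
    {
        // Tree reduction across the 32 lanes of a fully active warp;
        // after the loop, lane 0 holds the sum of all lanes' values.
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down(val, offset);
        // One atomic per warp instead of 32 contending atomics.
        if ((threadIdx.x & 31) == 0)
            atomicAdd(addr, val);
    }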

I would suggest reading the complete section 8.1.5 at a minimum.