When I came across this question on SO, I was curious to know the answer, so I wrote the code below to test atomic operation performance in different scenarios. The OS is Ubuntu 12.04 with CUDA 5.5, and the device is a GeForce GTX 780 (Kepler architecture). I compiled the code with the -O3 flag, for CC 3.5.
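In case you want to reproduce this, the build and profiling invocations looked roughly like the following (any file name works; atomics_test.cu is arbitrary):

nvcc -O3 -arch=sm_35 -o atomics_test atomics_test.cu
nvprof ./atomics_test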
#include <stdio.h>

static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

#define BLOCK_SIZE 256
#define RESTRICTION_SIZE 32
// Kernel 1: coalesced atomics; each thread updates its own global word
__global__ void CoalescedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+i, 6); // arbitrary number to add
    }
}
// Kernel 2: all threads fold into a window of RESTRICTION_SIZE (32) global
// words, one per lane id
__global__ void AddressRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+(i&(RESTRICTION_SIZE-1)), 6); // arbitrary number to add
    }
}
// Kernel 3: all 32 lanes of a warp hit the same global word (i>>5)
__global__ void WarpRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+(i>>5), 6); // arbitrary number to add
    }
}
// Kernel 4: every thread in the grid hits one single global word
__global__ void SameAddressAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data, 6); // arbitrary number to add
    }
}
// Kernel 5: coalesced shared-memory atomics; each thread updates its own slot
// (smem_data is left uninitialized on purpose, since only timing is measured)
__global__ void CoalescedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+threadIdx.x, data[i]);
    }
}
// Kernel 6: 32 shared words indexed by lane id; same-lane threads across the
// block's warps contend for each word
__global__ void AddressRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+(threadIdx.x&(RESTRICTION_SIZE-1)), data[i&(RESTRICTION_SIZE-1)]);
    }
}
// Kernel 7: all 32 lanes of a warp hit the same shared word (threadIdx.x>>5)
__global__ void WarpRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+(threadIdx.x>>5), data[i>>5]);
    }
}
// Kernel 8: every thread of a block hits one single shared word
__global__ void SameAddressAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data, data[0]);
    }
}
int main(void)
{
    const int n = 2 << 24; // 33,554,432 elements
    int* data = new int[n];
    for(int i=0; i<n; i++) {
        data[i] = i%1024+1;
    }
    int* dev_data;
    HANDLE_ERROR( cudaMalloc((void **)&dev_data, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemset(dev_data, 0, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemcpy( dev_data, data, n * sizeof(int), cudaMemcpyHostToDevice) );
    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12*2048)/BLOCK_SIZE); // 12 SMXs on GTX 780, each can hold 2048 resident threads
        CoalescedAtomicOnGlobalMem<<<gridsize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12*2048)/BLOCK_SIZE); // 12 SMXs on GTX 780, each can hold 2048 resident threads
        AddressRestrictedAtomicOnGlobalMem<<<gridsize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12*2048)/BLOCK_SIZE); // 12 SMXs on GTX 780, each can hold 2048 resident threads
        WarpRestrictedAtomicOnGlobalMem<<<gridsize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12*2048)/BLOCK_SIZE); // 12 SMXs on GTX 780, each can hold 2048 resident threads
        SameAddressAtomicOnGlobalMem<<<gridsize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12*2048)/BLOCK_SIZE); // 12 SMXs on GTX 780, each can hold 2048 resident threads
        CoalescedAtomicOnSharedMem<<<gridsize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12*2048)/BLOCK_SIZE); // 12 SMXs on GTX 780, each can hold 2048 resident threads
        AddressRestrictedAtomicOnSharedMem<<<gridsize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12*2048)/BLOCK_SIZE); // 12 SMXs on GTX 780, each can hold 2048 resident threads
        WarpRestrictedAtomicOnSharedMem<<<gridsize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12*2048)/BLOCK_SIZE); // 12 SMXs on GTX 780, each can hold 2048 resident threads
        SameAddressAtomicOnSharedMem<<<gridsize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );
    HANDLE_ERROR( cudaFree(dev_data) );
    HANDLE_ERROR( cudaDeviceReset() );
    delete[] data;
    printf("Program finished without error.\n");
    return 0;
}
Basically, the code above contains 8 kernels in which all threads perform atomicAdd over all the data:

1. on coalesced addresses in global memory;
2. on a restricted set of addresses (32 here) in global memory;
3. on the same address for all threads within a warp, in global memory;
4. on the same address for all threads in the grid, in global memory.

Items 5 through 8 are obtained by replacing global with shared in the items above. The chosen block size is 256.
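For scale: n = 2 << 24 = 33,554,432 elements, and every launch uses (12 × 2048) / 256 = 96 blocks of 256 threads, i.e. 24,576 threads in total, so the grid-stride loop makes each thread issue roughly 33,554,432 / 24,576 ≈ 1,365 atomicAdd operations per kernel launch.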
I profiled the program with nvprof. The output is:
Time(%)      Time  Calls       Avg       Min       Max  Name
 44.33%  2.35113s     50  47.023ms  46.987ms  47.062ms  SameAddressAtomicOnSharedMem(int*, int)
 31.89%  1.69104s     50  33.821ms  33.818ms  33.826ms  SameAddressAtomicOnGlobalMem(int*, int)
 10.10%  535.88ms     50  10.718ms  10.707ms  10.738ms  WarpRestrictedAtomicOnSharedMem(int*, int)
  3.96%  209.95ms     50  4.1990ms  4.1895ms  4.2103ms  AddressRestrictedAtomicOnSharedMem(int*, int)
  3.95%  209.47ms     50  4.1895ms  4.1893ms  4.1900ms  AddressRestrictedAtomicOnGlobalMem(int*, int)
  3.33%  176.48ms     50  3.5296ms  3.5050ms  3.5498ms  WarpRestrictedAtomicOnGlobalMem(int*, int)
  1.08%  57.428ms     50  1.1486ms  1.1460ms  1.1510ms  CoalescedAtomicOnGlobalMem(int*, int)
  0.84%  44.784ms     50  895.68us  888.65us  905.77us  CoalescedAtomicOnSharedMem(int*, int)
  0.51%  26.805ms      1  26.805ms  26.805ms  26.805ms  [CUDA memcpy HtoD]
  0.01%  543.61us      1  543.61us  543.61us  543.61us  [CUDA memset]
Obviously, coalesced conflict-free atomic operations have the best performance, and same-address atomics have the worst. One thing I cannot explain is why same-address atomics on shared memory (within a block) are slower than on global memory (common to all threads). When all lanes of a warp access the same location in shared memory, performance is very bad, yet (surprisingly) that is not the case when they do it on global memory. I cannot explain why. Another confusing case is that address-restricted atomics on global memory perform worse than having all threads of a warp hit the same address, even though memory contention seems lower in the first case.

In any case, I would be glad if someone could explain the profiling results above.

Answer (score: 7)
As a preface, my comments here may to some extent be architecture-specific. But for the architectures at hand (up to cc 3.5, AFAIK), shared memory atomics are implemented via a code sequence (created by the assembler). If multiple threads contend for access to the same bank/location, this code sequence operating on shared memory gets serialized.

The RMW operation itself is atomic in the sense that no other thread can disrupt it (i.e., produce an incorrect result), but when threads contend to perform atomic operations on a single shared memory location, the contention gives rise to serialization, exacerbating the latency associated with the atomics.
Quoting Nick from the CUDA Handbook:
Unlike global memory, which implements atomics using single instructions (either GATOM or GRED, depending on whether the return value is used), shared memory atomics are implemented with explicit lock/unlock semantics, and the compiler emits code that causes each thread to loop over these lock operations until the thread has performed its atomic operation.
and
Take care to avoid contention, or the loop in Listing 8-2 may iterate up to 32 times.
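To make the quoted mechanism concrete, here is a rough sketch of the lock loop that the assembler emits for a shared-memory atomic on these architectures. This is conceptual, not real SASS; the helper names are hypothetical stand-ins for the load-with-lock/store-with-unlock instructions (LDSLK/STSUL in the Handbook's Listing 8-2):

// Conceptual per-thread lock loop for atomicAdd(addr, val) on shared memory.
// load_shared_with_lock / store_shared_with_unlock are hypothetical helpers.
bool done = false;
while (!done) {
    int old = load_shared_with_lock(addr, &done);  // try to take the per-address lock
    if (done) {
        store_shared_with_unlock(addr, old + val); // perform the RMW, release the lock
    }
    // Threads that failed to acquire the lock retry; with all 32 lanes of a
    // warp contending for one address, the loop can iterate up to 32 times.
}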
I would suggest that you read, at minimum, the whole of Section 8.1.5.
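As a closing side note (this sketch is mine, not from the Handbook or the question's code): the usual way to avoid such contention on cc 3.x is to pre-reduce inside each warp, so that only one lane per warp issues the atomic:

// Sketch of warp aggregation for the same-address case. Assumes full warps
// are active (i.e. nElem is a multiple of the total thread count); a robust
// version must handle the tail. __shfl_down is the Kepler-era intrinsic
// (CUDA 9+ code would use __shfl_down_sync with an explicit mask).
__global__ void WarpAggregatedAtomic(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for (unsigned int i = tid; i < nElem; i += blockDim.x * gridDim.x) {
        int val = 6; // same arbitrary addend as above
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down(val, offset); // lane 0 accumulates the warp total
        if ((threadIdx.x & 31) == 0)
            atomicAdd(data, val); // one atomic per warp instead of 32
    }
}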