我正在测试将原子加法运算插入到优化的数组缩减内核中以测量性能影响的效果。我没有理解结果。我测试了五种不同的内核:
0 - fully optimized reduction kernel as provided in samples/6_Advanced/reduction/reduction_kernel.cu
1 - optimized reduction kernel as described in samples/6_Advanced/docs/reduction.pdf
2 - kernel 1 with atomic warp-synchronous reduction
3 - kernel 2 with completely atomic reduction within all shared memory
4 - kernel 3 with completely atomic reduction
我在足够大的元素样本上使用的设备的平均缩短时间:
0 - 0.00103s
1 - 0.00103s
2 - 0.00103s
3 - 0.00103s
4 - 0.00117s
为什么原子操作似乎对内核2
或3
没有任何影响,对内核4
有一些小影响?
Here是完整代码。相关内核是:
/////////////////
// warp reduce //
/////////////////
/* warp-synchronous reduction using volatile memory
* to prevent instruction reordering for non-atomic
* operations */
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, int tid) {
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
////////////////////////
// atomic warp reduce //
////////////////////////
/* warp-synchronous reduction using atomic operations
* to serialize computation */
template <unsigned int blockSize>
__device__ void atomicWarpReduce(int *sdata, int tid) {
if (blockSize >= 64) atomicAdd(&sdata[tid], sdata[tid + 32]);
if (blockSize >= 32) atomicAdd(&sdata[tid], sdata[tid + 16]);
if (blockSize >= 16) atomicAdd(&sdata[tid], sdata[tid + 8]);
if (blockSize >= 8) atomicAdd(&sdata[tid], sdata[tid + 4]);
if (blockSize >= 4) atomicAdd(&sdata[tid], sdata[tid + 2]);
if (blockSize >= 2) atomicAdd(&sdata[tid], sdata[tid + 1]);
}
////////////////////////
// reduction kernel 0 //
////////////////////////
/* fastest reduction algorithm provided by
* cuda/samples/6_Advanced/reduction/reduction_kernel.cu */
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
int sum = 0;
// reduce multiple elements per thread
while (i < n) {
sum += g_idata[i];
// check bounds
if (nIsPow2 || i + blockSize < n)
sum += g_idata[i + blockSize];
i += gridSize;
}
// local sum -> shared memory
sdata[tid] = sum;
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
sdata[tid] = sum = sum + sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
sdata[tid] = sum = sum + sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
sdata[tid] = sum = sum + sdata[tid + 64];
__syncthreads();
}
if (tid < 32) {
// warp-synchronous reduction
// volatile memory stores won't be reordered by compiler
volatile int *smem = sdata;
if (blockSize >= 64)
smem[tid] = sum = sum + smem[tid + 32];
if (blockSize >= 32)
smem[tid] = sum = sum + smem[tid + 16];
if (blockSize >= 16)
smem[tid] = sum = sum + smem[tid + 8];
if (blockSize >= 8)
smem[tid] = sum = sum + smem[tid + 4];
if (blockSize >= 4)
smem[tid] = sum = sum + smem[tid + 2];
if (blockSize >= 2)
smem[tid] = sum = sum + smem[tid + 1];
}
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 1 //
/////////////////////////
/* fastest reduction alrogithm described in
* cuda/samples/6_Advanced/reduction/doc/reduction.pdf */
template <unsigned int blockSize>
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
sdata[tid] = 0;
// reduce multiple elements per thread
while (i < n) {
sdata[tid] += g_idata[i] + g_idata[i+blockSize];
i += gridSize;
}
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
sdata[tid] += sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
sdata[tid] += sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
sdata[tid] += sdata[tid + 64];
__syncthreads();
}
if (tid < 32) warpReduce<blockSize>(sdata, tid);
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 2 //
/////////////////////////
/* reduction kernel 1 executed
* with atomic warp-synchronous addition */
template <unsigned int blockSize>
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
sdata[tid] = 0;
// reduce multiple elements per thread
while (i < n) {
sdata[tid] += g_idata[i] + g_idata[i+blockSize];
i += gridSize;
}
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
sdata[tid] += sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
sdata[tid] += sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
sdata[tid] += sdata[tid + 64];
__syncthreads();
}
if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 3 //
/////////////////////////
template <unsigned int blockSize>
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
sdata[tid] = 0;
// reduce multiple elements per thread
while (i < n) {
sdata[tid] += g_idata[i] + g_idata[i+blockSize];
i += gridSize;
}
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
atomicAdd(&sdata[tid], sdata[tid + 256]);
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
atomicAdd(&sdata[tid], sdata[tid + 128]);
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
atomicAdd(&sdata[tid], sdata[tid + 64]);
__syncthreads();
}
if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 4 //
/////////////////////////
template <unsigned int blockSize>
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
sdata[tid] = 0;
// reduce multiple elements per thread
while (i < n) {
atomicAdd(&sdata[tid], (g_idata[i] + g_idata[i+blockSize]));
i += gridSize;
}
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
atomicAdd(&sdata[tid], sdata[tid + 256]);
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
atomicAdd(&sdata[tid], sdata[tid + 128]);
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
atomicAdd(&sdata[tid], sdata[tid + 64]);
__syncthreads();
}
if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
答案 0 :(得分:2)
在您的代码中,您没有使用正确的CUDA error checking进行内核调用。由于时间都是一样的,我强烈怀疑你的内核并没有真正启动。我已经在我自己的CUDA减少设置上验证了当减少元素的数量为1<<24
时,实现了相同的时序。上面的CUDA错误检查会返回无效的配置参数。
我有机会提到您的atomicWarpReduce
__device__
函数实际上是错误的,因为它缺少正确的同步(另请参见主题Removing __syncthreads() in CUDA warp-level reduction)。正确的版本是
template <class T>
__device__ void atomicWarpReduce(T *sdata, int tid) {
atomicAdd(&sdata[tid], sdata[tid + 32]); __syncthreads();
atomicAdd(&sdata[tid], sdata[tid + 16]); __syncthreads();
atomicAdd(&sdata[tid], sdata[tid + 8]); __syncthreads();
atomicAdd(&sdata[tid], sdata[tid + 4]); __syncthreads();
atomicAdd(&sdata[tid], sdata[tid + 2]); __syncthreads();
atomicAdd(&sdata[tid], sdata[tid + 1]); __syncthreads();
}
当然,在这种情况下你不需要原子,我理解它只是为了理解。但是,原子不会强制执行同步,只是通过对共享内存数组sdata
进行顺序访问来避免竞争条件(无论如何都不存在)。您可能希望比较
你的版本
Function : _Z18reduce4_atomicWarpIiEvPT_S1_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ SHL R3, R0, 0x1; /* 0x6000c0000400dc03 */
/*0018*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0020*/ IMAD R3, R3, c[0x0][0x8], R2; /* 0x200440002030dca3 */
/*0028*/ IADD R4, R3, c[0x0][0x8]; /* 0x4800400020311c03 */
/*0030*/ ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT; /* 0x188e4000a031dc03 */
/*0038*/ ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT; /* 0x1b0e4000a043dc03 */
/*0040*/ @P0 ISCADD R3, R3, c[0x0][0x20], 0x2; /* 0x400040008030c043 */
/*0048*/ @!P1 ISCADD R4, R4, c[0x0][0x20], 0x2; /* 0x4000400080412443 */
/*0050*/ @!P0 MOV R5, RZ; /* 0x28000000fc0161e4 */
/*0058*/ @!P1 LD R4, [R4]; /* 0x8000000000412485 */
/*0060*/ @P0 LD R5, [R3]; /* 0x8000000000314085 */
/*0068*/ SHL R3, R2, 0x2; /* 0x6000c0000820dc03 */
/*0070*/ NOP; /* 0x4000000000001de4 */
/*0078*/ @!P1 IADD R5, R4, R5; /* 0x4800000014416403 */
/*0080*/ MOV R4, c[0x0][0x8]; /* 0x2800400020011de4 */
/*0088*/ STS [R3], R5; /* 0xc900000000315c85 */
/*0090*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0098*/ MOV R6, c[0x0][0x8]; /* 0x2800400020019de4 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R6, 0x42, PT; /* 0x188ec0010861dc03 */
/*00a8*/ @P0 BRA 0x118; /* 0x40000001a00001e7 */
/*00b0*/ NOP; /* 0x4000000000001de4 */
/*00b8*/ NOP; /* 0x4000000000001de4 */
/*00c0*/ MOV R6, R4; /* 0x2800000010019de4 */
/*00c8*/ SHR.U32 R4, R4, 0x1; /* 0x5800c00004411c03 */
/*00d0*/ ISETP.GE.U32.AND P0, PT, R2, R4, PT; /* 0x1b0e00001021dc03 */
/*00d8*/ @!P0 IADD R7, R4, R2; /* 0x480000000841e003 */
/*00e0*/ @!P0 SHL R7, R7, 0x2; /* 0x6000c0000871e003 */
/*00e8*/ @!P0 LDS R7, [R7]; /* 0xc10000000071e085 */
/*00f0*/ @!P0 IADD R5, R7, R5; /* 0x4800000014716003 */
/*00f8*/ @!P0 STS [R3], R5; /* 0xc900000000316085 */
/*0100*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0108*/ ISETP.GT.U32.AND P0, PT, R6, 0x83, PT; /* 0x1a0ec0020c61dc03 */
/*0110*/ @P0 BRA 0xc0; /* 0x4003fffea00001e7 */
/*0118*/ ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT; /* 0x1a0ec0007c21dc03 */
/*0120*/ SSY 0x2a8; /* 0x6000000600000007 */
/*0128*/ @P0 BRA 0x2a0; /* 0x40000005c00001e7 */
/*0130*/ LDS R4, [R3+0x80]; R4 = sdata[tid + 32]
/*0138*/ SSY 0x168;
/*0140*/ LDSLK P0, R5, [R3]; R5 = sdata[tid] (load from shared memory and lock)
/*0148*/ @P0 IADD R5, R5, R4; R5 = R5 + R4
/*0150*/ @P0 STSUL [R3], R5; sdata[tid] = R5 (store to shared memory and unlock)
/*0158*/ @!P0 BRA 0x140; /* 0x4003ffff800021e7 */
/*0160*/ NOP.S; /* 0x4000000000001df4 */
/*0168*/ LDS R4, [R3+0x40]; R4 = sdata[tid + 16]
/*0170*/ SSY 0x1a8;
/*0178*/ NOP; /* 0x4000000000001de4 */
/*0180*/ LDSLK P0, R5, [R3]; R5 = sdata[tid] (load from shared memory and lock)
/*0188*/ @P0 IADD R5, R5, R4; R5 = R5 + R4
/*0190*/ @P0 STSUL [R3], R5; sdata[tid] = R5 (store to shared memory and unlock)
/*0198*/ @!P0 BRA 0x180; /* 0x4003ffff800021e7 */
/*01a0*/ NOP.S; /* 0x4000000000001df4 */
/*01a8*/ LDS R4, [R3+0x20]; R4 = sdata[tid + 8]
/*01b0*/ SSY 0x1e8;
/*01b8*/ NOP; /* 0x4000000000001de4 */
/*01c0*/ LDSLK P0, R5, [R3]; R5 = sdata[tid] (load from shared memory and lock)
/*01c8*/ @P0 IADD R5, R5, R4; R5 = R5 + R4
/*01d0*/ @P0 STSUL [R3], R5; sdata[tid] = R5 (store to shared memory and unlock)
/*01d8*/ @!P0 BRA 0x1c0; /* 0x4003ffff800021e7 */
/*01e0*/ NOP.S; /* 0x4000000000001df4 */
/*01e8*/ LDS R6, [R3+0x10]; /* 0xc100000040319c85 */
/*01f0*/ LDS R5, [R3+0x8]; /* 0xc100000020315c85 */
/*01f8*/ LDS R4, [R3+0x4]; /* 0xc100000010311c85 */
/*0200*/ SSY 0x230; /* 0x60000000a0000007 */
/*0208*/ LDSLK P0, R7, [R3]; /* 0xc40000000031dc85 */
/*0210*/ @P0 IADD R7, R7, R6; /* 0x480000001871c003 */
/*0218*/ @P0 STSUL [R3], R7; /* 0xcc0000000031c085 */
/*0220*/ @!P0 BRA 0x208; /* 0x4003ffff800021e7 */
/*0228*/ NOP.S; /* 0x4000000000001df4 */
/*0230*/ SSY 0x268; /* 0x60000000c0000007 */
/*0238*/ NOP; /* 0x4000000000001de4 */
/*0240*/ LDSLK P0, R6, [R3]; /* 0xc400000000319c85 */
/*0248*/ @P0 IADD R6, R6, R5; /* 0x4800000014618003 */
/*0250*/ @P0 STSUL [R3], R6; /* 0xcc00000000318085 */
/*0258*/ @!P0 BRA 0x240; /* 0x4003ffff800021e7 */
/*0260*/ NOP.S; /* 0x4000000000001df4 */
/*0268*/ NOP; /* 0x4000000000001de4 */
/*0270*/ NOP; /* 0x4000000000001de4 */
/*0278*/ NOP; /* 0x4000000000001de4 */
/*0280*/ LDSLK P0, R5, [R3]; /* 0xc400000000315c85 */
/*0288*/ @P0 IADD R5, R5, R4; /* 0x4800000010514003 */
/*0290*/ @P0 STSUL [R3], R5; /* 0xcc00000000314085 */
/*0298*/ @!P0 BRA 0x280; /* 0x4003ffff800021e7 */
/*02a0*/ ISETP.NE.AND.S P0, PT, R2, RZ, PT; /* 0x1a8e0000fc21dc33 */
/*02a8*/ @P0 BRA.U 0x2c8; /* 0x40000000600081e7 */
/*02b0*/ @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2; /* 0x4000400090002043 */
/*02b8*/ @!P0 LDS R2, [RZ]; /* 0xc100000003f0a085 */
/*02c0*/ @!P0 ST [R0], R2; /* 0x900000000000a085 */
/*02c8*/ EXIT; /* 0x8000000000001de7 */
和
正确版本
Function : _Z18reduce4_atomicWarpIiEvPT_S1_j
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ SHL R3, R0, 0x1; /* 0x6000c0000400dc03 */
/*0018*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0020*/ IMAD R3, R3, c[0x0][0x8], R2; /* 0x200440002030dca3 */
/*0028*/ IADD R4, R3, c[0x0][0x8]; /* 0x4800400020311c03 */
/*0030*/ ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT; /* 0x188e4000a031dc03 */
/*0038*/ ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT; /* 0x1b0e4000a043dc03 */
/*0040*/ @P0 ISCADD R3, R3, c[0x0][0x20], 0x2; /* 0x400040008030c043 */
/*0048*/ @!P1 ISCADD R4, R4, c[0x0][0x20], 0x2; /* 0x4000400080412443 */
/*0050*/ @!P0 MOV R5, RZ; /* 0x28000000fc0161e4 */
/*0058*/ @!P1 LD R4, [R4]; /* 0x8000000000412485 */
/*0060*/ @P0 LD R5, [R3]; /* 0x8000000000314085 */
/*0068*/ SHL R3, R2, 0x2; /* 0x6000c0000820dc03 */
/*0070*/ NOP; /* 0x4000000000001de4 */
/*0078*/ @!P1 IADD R5, R4, R5; /* 0x4800000014416403 */
/*0080*/ MOV R4, c[0x0][0x8]; /* 0x2800400020011de4 */
/*0088*/ STS [R3], R5; /* 0xc900000000315c85 */
/*0090*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0098*/ MOV R6, c[0x0][0x8]; /* 0x2800400020019de4 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R6, 0x42, PT; /* 0x188ec0010861dc03 */
/*00a8*/ @P0 BRA 0x118; /* 0x40000001a00001e7 */
/*00b0*/ NOP; /* 0x4000000000001de4 */
/*00b8*/ NOP; /* 0x4000000000001de4 */
/*00c0*/ MOV R6, R4; /* 0x2800000010019de4 */
/*00c8*/ SHR.U32 R4, R4, 0x1; /* 0x5800c00004411c03 */
/*00d0*/ ISETP.GE.U32.AND P0, PT, R2, R4, PT; /* 0x1b0e00001021dc03 */
/*00d8*/ @!P0 IADD R7, R4, R2; /* 0x480000000841e003 */
/*00e0*/ @!P0 SHL R7, R7, 0x2; /* 0x6000c0000871e003 */
/*00e8*/ @!P0 LDS R7, [R7]; /* 0xc10000000071e085 */
/*00f0*/ @!P0 IADD R5, R7, R5; /* 0x4800000014716003 */
/*00f8*/ @!P0 STS [R3], R5; /* 0xc900000000316085 */
/*0100*/ BAR.RED.POPC RZ, RZ, RZ, PT; __syncthreds()
/*0108*/ ISETP.GT.U32.AND P0, PT, R6, 0x83, PT; /* 0x1a0ec0020c61dc03 */
/*0110*/ @P0 BRA 0xc0; /* 0x4003fffea00001e7 */
/*0118*/ ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT;
/*0120*/ SSY 0x2b8;
/*0128*/ @P0 BRA 0x2b0; /* 0x40000006000001e7 */
/*0130*/ LDS R4, [R3+0x80]; R4 = sdata[tid + 32]
/*0138*/ SSY 0x168;
/*0140*/ LDSLK P0, R5, [R3]; R5 = sdata[tid] (load from shared memory and lock)
/*0148*/ @P0 IADD R5, R5, R4; R5 = R5 + R4
/*0150*/ @P0 STSUL [R3], R5; sdata[tid] = R5 (store to shared memory and unlock)
/*0158*/ @!P0 BRA 0x140; /* 0x4003ffff800021e7 */
/*0160*/ NOP.S; /* 0x4000000000001df4 */
/*0168*/ BAR.RED.POPC RZ, RZ, RZ, PT; __syncthreads()
/*0170*/ LDS R4, [R3+0x40]; R4 = sdata[tid + 16]
/*0178*/ SSY 0x1a8;
/*0180*/ LDSLK P0, R5, [R3]; R5 = sdata[tid] (load from shared memory and lock)
/*0188*/ @P0 IADD R5, R5, R4; R5 = R5 + R4
/*0190*/ @P0 STSUL [R3], R5; sdata[tid] = R5 (store to shared memory and unlock)
/*0198*/ @!P0 BRA 0x180; /* 0x4003ffff800021e7 */
/*01a0*/ NOP.S; /* 0x4000000000001df4 */
/*01a8*/ BAR.RED.POPC RZ, RZ, RZ, PT; __syncthreads()
/*01b0*/ LDS R4, [R3+0x20]; R4 = sdata[tid + 8]
/*01b8*/ SSY 0x1e8;
/*01c0*/ LDSLK P0, R5, [R3]; R5 = sdata[tid] (load from shared memory and lock)
/*01c8*/ @P0 IADD R5, R5, R4; R5 = R5 + R4
/*01d0*/ @P0 STSUL [R3], R5; sdata[tid] = R5 (store to shared memory and unlock)
/*01d8*/ @!P0 BRA 0x1c0; /* 0x4003ffff800021e7 */
/*01e0*/ NOP.S; /* 0x4000000000001df4 */
/*01e8*/ BAR.RED.POPC RZ, RZ, RZ, PT; __syncthreads()
/*01f0*/ LDS R4, [R3+0x10]; R4 = sdata[tid + 4]
/*01f8*/ SSY 0x228;
/*0200*/ LDSLK P0, R5, [R3]; R5 = sdata[tid] (load from shared memory and lock)
/*0208*/ @P0 IADD R5, R5, R4; R5 = R5 + R4
/*0210*/ @P0 STSUL [R3], R5; R5 = R5 + R4
/*0218*/ @!P0 BRA 0x200; /* 0x4003ffff800021e7 */
/*0220*/ NOP.S; /* 0x4000000000001df4 */
/*0228*/ BAR.RED.POPC RZ, RZ, RZ, PT; __syncthreads()
/*0230*/ LDS R4, [R3+0x8]; R4 = sdata[tid + 2]
/*0238*/ SSY 0x268;
/*0240*/ LDSLK P0, R5, [R3]; R5 = sdata[tid] (load from shared memory and lock)
/*0248*/ @P0 IADD R5, R5, R4; R5 = R5 + R4
/*0250*/ @P0 STSUL [R3], R5; sdata[tid] = R5 (store to shared memory and unlock)
/*0258*/ @!P0 BRA 0x240; /* 0x4003ffff800021e7 */
/*0260*/ NOP.S; /* 0x4000000000001df4 */
/*0268*/ BAR.RED.POPC RZ, RZ, RZ, PT; __syncthreads()
/*0270*/ LDS R4, [R3+0x4]; R4 = sdata[tid + 1]
/*0278*/ SSY 0x2a8;
/*0280*/ LDSLK P0, R5, [R3]; R5 = sdata[tid] (load from shared memory and lock)
/*0288*/ @P0 IADD R5, R5, R4; R5 = R5 + R4
/*0290*/ @P0 STSUL [R3], R5; sdata[tid] = R5 (store to shared memory and unlock)
/*0298*/ @!P0 BRA 0x280; /* 0x4003ffff800021e7 */
/*02a0*/ NOP.S; /* 0x4000000000001df4 */
/*02a8*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*02b0*/ ISETP.NE.AND.S P0, PT, R2, RZ, PT; /* 0x1a8e0000fc21dc33 */
/*02b8*/ @P0 BRA.U 0x2d8; /* 0x40000000600081e7 */
/*02c0*/ @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2; /* 0x4000400090002043 */
/*02c8*/ @!P0 LDS R2, [RZ]; /* 0xc100000003f0a085 */
/*02d0*/ @!P0 ST [R0], R2; /* 0x900000000000a085 */
/*02d8*/ EXIT; /* 0x8000000000001de7 */
回到真正的问题,通过确保正确启动内核,您可以轻松验证原子是否对性能产生影响。