I'm trying to understand how to use __threadfence(), since it seems like a powerful synchronization primitive that lets different blocks work together without the huge hassle of ending a kernel and starting a new one. The CUDA C Programming Guide has an example of it (Appendix B.5), which is fleshed out in the "threadFenceReduction" sample in the SDK, so it seems like something we "should" be using.
However, when I've tried using __threadfence(), it is shockingly slow. See the code below for an example. As far as I can tell, __threadfence() should just make sure that all pending memory transfers from the current thread block are finished before proceeding. Memory latency is somewhat better than a microsecond, I believe, so the total time to deal with the 64 KB of memory transfers in the included code (16 blocks each writing 1000 ints to junk, i.e. 64,000 bytes), on a GTX680, should be somewhere around a microsecond. Instead, the __threadfence() instruction seems to take about 20 microseconds! Instead of using __threadfence() to synchronize, I can end the kernel and launch an entirely new kernel (in the same, default stream, so that it is synchronized) in less than a third of the time!
What is going on here? Does my code have a bug that I'm not noticing? Or is __threadfence() really 20x slower than it should be, and 6x slower than an entire kernel launch plus cleanup?
Time for 1000 runs of the threadfence kernel: 27.716831 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 25.962912 ms
Synchronizing without threadfence, by splitting to two kernels: 7.653344 ms
Answer: 120
#include "cuda.h"
#include <cstdio>
__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;
__device__ int scratch[16];
__device__ int junk[16000];
__device__ int answer;
__global__ void usethreadfence() //just like the code example in B.5 of the CUDA C Programming Guide
{
if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x; //do some more memory writes to make the kernel nontrivial
__threadfence();
if (threadIdx.x==0) {
unsigned int value = atomicInc(&count, gridDim.x);
isLastBlockDone = (value == (gridDim.x - 1));
}
__syncthreads();
if (isLastBlockDone && threadIdx.x==0) {
// The last block sums the results stored in scratch[0 .. gridDim.x-1]
int sum=0;
for (int i=0;i<gridDim.x;i++) sum+=scratch[i];
answer=sum;
}
}
__global__ void justthreadfence() //first three lines of the previous kernel, so we can compare speeds
{
if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x;
__threadfence();
}
__global__ void usetwokernels_1() //this and the next kernel reproduce the functionality of the first kernel, but faster!
{
if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x;
}
__global__ void usetwokernels_2()
{
if (threadIdx.x==0) {
int sum=0;
for (int i=0;i<gridDim.x;i++) sum+=scratch[i];
answer=sum;
}
}
int main() {
    int sum;
    cudaEvent_t start, stop;
    float time;

    // Time 1000 launches of the single-kernel version that synchronizes with __threadfence().
    cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    for (int i = 0; i < 1000; i++) usethreadfence<<<16,1000>>>();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Time for 1000 runs of the threadfence kernel: %f ms\n", time);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    cudaMemcpyFromSymbol(&sum, answer, sizeof(int));
    printf("Answer: %d\n", sum);

    // Time just the first three lines of that kernel, including the fence.
    cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    for (int i = 0; i < 1000; i++) justthreadfence<<<16,1000>>>();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Time for 1000 runs of just the first 3 lines, including threadfence: %f ms\n", time);
    cudaEventDestroy(start); cudaEventDestroy(stop);

    // Time the two-kernel version, synchronized implicitly by the default stream.
    cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    for (int i = 0; i < 1000; i++) { usetwokernels_1<<<16,1000>>>(); usetwokernels_2<<<16,1000>>>(); }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Synchronizing without threadfence, by splitting to two kernels: %f ms\n", time);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    cudaMemcpyFromSymbol(&sum, answer, sizeof(int));
    printf("Answer: %d\n", sum);
}
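(For anyone wanting to reproduce this: the whole listing compiles as a single .cu file, with something along the lines of nvcc -arch=sm_30 threadfence.cu -o threadfence. The exact flags are my guess rather than part of the original post; sm_30 matches the GTX680's compute capability.)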
Answer (score: 1)
I have tested your code, compiled with CUDA 6.0, on two different cards: a GT540M (Fermi) and a K20c (Kepler). These are the results:

GT540M
Time for 1000 runs of the threadfence kernel: 303.373688 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 300.395416 ms
Synchronizing without threadfence, by splitting to two kernels: 597.729919 ms
Answer: 120
Kepler K20c
Time for 1000 runs of the threadfence kernel: 10.164096 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 8.808896 ms
Synchronizing without threadfence, by splitting to two kernels: 17.330784 ms
Answer: 120
I have not found any particularly slow behavior of __threadfence() relative to the other two cases considered. This can be verified by looking at the disassembled code.
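(The SASS listings below can be produced with the CUDA binary disassembler, e.g. by running cuobjdump -sass on the compiled executable; the exact invocation is my assumption, since the tool used is not stated.)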
usethreadfence()
c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x
/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ S2R R0, SR_TID.X; R0 = threadIdx.x
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT; P0 = (R0 != 0)
/*0018*/ S2R R5, SR_CTAID.X; R5 = blockIdx.x
/*0020*/ IMAD R3, R5, 0x3e8, R0; R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
if (threadIdx.x == 0)
/*0028*/ @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;           R2 = &scratch[blockIdx.x]
/*0030*/ IADD R4, R0, 0x11; R4 = R0 + 17 = threadIdx.x + 17
/*0038*/ ISCADD R3, R3, c[0xe][0x4], 0x2; R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/ @!P0 ST [R2], R5;                               scratch[blockIdx.x] = blockIdx.x
/*0048*/ ST [R3], R4; junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/ MEMBAR.GL; __threadfence
/*0058*/ @P0 BRA.U 0x98; if (threadIdx.x != 0) branch to 0x98
if (threadIdx.x == 0)
/*0060*/ @!P0 MOV R2, c[0xe][0xc]; R2 = &count
/*0068*/ @!P0 MOV R3, c[0x0][0x14]; R3 = gridDim.x
/*0070*/ @!P0 ATOM.INC R2, [R2], R3;                     R2 = value = old count; count = (count >= gridDim.x) ? 0 : (count + 1)
/*0078*/ @!P0 IADD R3, R3, -0x1; R3 = R3 - 1 = gridDim.x - 1
/*0080*/ @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;           P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/ @!P0 SEL R2, RZ, 0x1, !P1; if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/ @!P0 STS.U8 [RZ], R2;                           stores R2 (i.e., isLastBlockDone) to shared memory at address [0]
/*0098*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/ BAR.RED.POPC RZ, RZ, RZ, PT; __syncthreads()
/*00a8*/ LDS.U8 R0, [RZ];                                R0 = isLastBlockDone (loaded back from shared memory)
/*00b0*/ ISETP.NE.AND P0, PT, R0, RZ, P0;                P0 = (R0 != 0) && P0 = (isLastBlockDone && threadIdx.x == 0)
/*00b8*/ @!P0 EXIT;                                      exits unless (isLastBlockDone && threadIdx.x == 0)
/*00c0*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;      implements the for loop, unrolled by a factor of 4
/*00c8*/ MOV R0, RZ;
/*00d0*/ @!P0 BRA 0x1b8;
/*00d8*/ MOV R2, c[0x0][0x14];
/*00e0*/ ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/ MOV R2, RZ;
/*00f0*/ @!P0 BRA 0x170;
/*00f8*/ MOV R3, c[0x0][0x14];
/*0100*/ IADD R7, R3, -0x3;
/*0108*/ NOP;
/*0110*/ ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/ IADD R2, R2, 0x4;
/*0120*/ LD R4, [R3];
/*0128*/ ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/ LD R5, [R3+0x4];
/*0138*/ LD R6, [R3+0x8];
/*0140*/ LD R3, [R3+0xc];
/*0148*/ IADD R0, R4, R0;
/*0150*/ IADD R0, R5, R0;
/*0158*/ IADD R0, R6, R0;
/*0160*/ IADD R0, R3, R0;
/*0168*/ @P0 BRA 0x110;
/*0170*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/ @!P0 BRA 0x1b8;
/*0180*/ ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/ IADD R2, R2, 0x1;
/*0190*/ LD R3, [R3];
/*0198*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/ NOP;
/*01a8*/ IADD R0, R3, R0;
/*01b0*/ @P0 BRA 0x180;
/*01b8*/ MOV R2, c[0xe][0x8];
/*01c0*/ ST [R2], R0;
/*01c8*/ EXIT;
justthreadfence()
Function : _Z15justthreadfencev
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R3, SR_TID.X; /* 0x2c0000008400dc04 */
/*0010*/ ISETP.NE.AND P0, PT, R3, RZ, PT; /* 0x1a8e0000fc31dc23 */
/*0018*/ S2R R4, SR_CTAID.X; /* 0x2c00000094011c04 */
/*0020*/ IMAD R2, R4, 0x3e8, R3; /* 0x2006c00fa0409ca3 */
/*0028*/ @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2; /* 0x4000780000402043 */
/*0030*/ IADD R3, R3, 0x11; /* 0x4800c0004430dc03 */
/*0038*/ ISCADD R2, R2, c[0xe][0x4], 0x2; /* 0x4000780010209c43 */
/*0040*/ @!P0 ST [R0], R4; /* 0x9000000000012085 */
/*0048*/ ST [R2], R3; /* 0x900000000020dc85 */
/*0050*/ MEMBAR.GL; /* 0xe000000000001c25 */
/*0058*/ EXIT; /* 0x8000000000001de7 */
usetwokernels_1()
Function : _Z15usetwokernels_1v
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_TID.X; /* 0x2c00000084001c04 */
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT; /* 0x1a8e0000fc01dc23 */
/*0018*/ S2R R2, SR_CTAID.X; /* 0x2c00000094009c04 */
/*0020*/ IMAD R4, R2, 0x3e8, R0; /* 0x2000c00fa0211ca3 */
/*0028*/ @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2; /* 0x400078000020e043 */
/*0030*/ IADD R0, R0, 0x11; /* 0x4800c00044001c03 */
/*0038*/ ISCADD R4, R4, c[0xe][0x4], 0x2; /* 0x4000780010411c43 */
/*0040*/ @!P0 ST [R3], R2; /* 0x900000000030a085 */
/*0048*/ ST [R4], R0; /* 0x9000000000401c85 */
/*0050*/ EXIT; /* 0x8000000000001de7 */
.....................................
usetwokernels_2()
Function : _Z15usetwokernels_2v
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_TID.X; /* 0x2c00000084001c04 */
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT; /* 0x1a8e0000fc01dc23 */
/*0018*/ @P0 EXIT; /* 0x80000000000001e7 */
/*0020*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT; /* 0x1a8e400053f1dc23 */
/*0028*/ MOV R0, RZ; /* 0x28000000fc001de4 */
/*0030*/ @!P0 BRA 0x130; /* 0x40000003e00021e7 */
/*0038*/ MOV R2, c[0x0][0x14]; /* 0x2800400050009de4 */
/*0040*/ ISETP.GT.AND P0, PT, R2, 0x3, PT; /* 0x1a0ec0000c21dc23 */
/*0048*/ MOV R2, RZ; /* 0x28000000fc009de4 */
/*0050*/ @!P0 BRA 0xe0; /* 0x40000002200021e7 */
/*0058*/ MOV R3, c[0x0][0x14]; /* 0x280040005000dde4 */
/*0060*/ IADD R7, R3, -0x3; /* 0x4800fffff431dc03 */
/*0068*/ NOP; /* 0x4000000000001de4 */
/*0070*/ NOP; /* 0x4000000000001de4 */
/*0078*/ NOP; /* 0x4000000000001de4 */
/*0080*/ ISCADD R3, R2, c[0xe][0x0], 0x2; /* 0x400078000020dc43 */
/*0088*/ LD R4, [R3]; /* 0x8000000000311c85 */
/*0090*/ IADD R2, R2, 0x4; /* 0x4800c00010209c03 */
/*0098*/ LD R5, [R3+0x4]; /* 0x8000000010315c85 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R2, R7, PT; /* 0x188e00001c21dc03 */
/*00a8*/ LD R6, [R3+0x8]; /* 0x8000000020319c85 */
/*00b0*/ LD R3, [R3+0xc]; /* 0x800000003030dc85 */
/*00b8*/ IADD R0, R4, R0; /* 0x4800000000401c03 */
/*00c0*/ IADD R0, R5, R0; /* 0x4800000000501c03 */
/*00c8*/ IADD R0, R6, R0; /* 0x4800000000601c03 */
/*00d0*/ IADD R0, R3, R0; /* 0x4800000000301c03 */
/*00d8*/ @P0 BRA 0x80; /* 0x4003fffe800001e7 */
/*00e0*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT; /* 0x188e40005021dc03 */
/*00e8*/ @!P0 BRA 0x130; /* 0x40000001000021e7 */
/*00f0*/ NOP; /* 0x4000000000001de4 */
/*00f8*/ NOP; /* 0x4000000000001de4 */
/*0100*/ ISCADD R3, R2, c[0xe][0x0], 0x2; /* 0x400078000020dc43 */
/*0108*/ IADD R2, R2, 0x1; /* 0x4800c00004209c03 */
/*0110*/ LD R3, [R3]; /* 0x800000000030dc85 */
/*0118*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT; /* 0x188e40005021dc03 */
/*0120*/ IADD R0, R3, R0; /* 0x4800000000301c03 */
/*0128*/ @P0 BRA 0x100; /* 0x4003ffff400001e7 */
/*0130*/ MOV R2, c[0xe][0x8]; /* 0x2800780020009de4 */
/*0138*/ ST [R2], R0; /* 0x9000000000201c85 */
/*0140*/ EXIT; /* 0x8000000000001de7 */
.....................................
As can be seen, the instructions of justthreadfence() are strictly contained within those of usethreadfence(), while the instructions of usetwokernels_1() and usetwokernels_2() are practically a partitioning of those of usethreadfence(). So the difference in timings can be ascribed to the launch overhead of the second kernel.
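To check that interpretation directly, a minimal sketch along the following lines (my addition, not from the original answer) times 1000 launches of an empty kernel with the same grid configuration, which isolates the bare launch overhead:

#include <cstdio>
#include <cuda_runtime.h>

// Empty kernel: any time attributed to it is pure launch overhead.
__global__ void emptykernel() {}

int main() {
    cudaEvent_t start, stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    // Same grid configuration as the kernels above: 16 blocks of 1000 threads.
    for (int i = 0; i < 1000; i++) emptykernel<<<16,1000>>>();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Time for 1000 empty kernel launches: %f ms\n", time);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}

If the overhead measured this way is close to the gap between the two-kernel timing and the single-kernel timing, that supports attributing the difference to the second launch rather than to __threadfence() itself.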