在CUDA中高效地将无符号值除以2的幂并向上取整

时间:2017-04-22 21:17:59

标签: optimization cuda rounding gpgpu integer-division

我刚读完:

Efficiently dividing unsigned value by a power of two, rounding up

我想知道在CUDA中最快的做法是什么。当然,我所说的"快"是指吞吐量方面(原问题还讨论了后续调用相互依赖的情况)。

对于该问题中提到的lg()函数(除数以2为底的对数),假设我们有:

// 1-based index of the least significant set bit of x (the CUDA __ffs family
// returns 0 when no bit is set). Only the explicit specializations below are defined.
template <typename T> __device__ int find_first_set(T x);

// 32-bit variant, backed by the __ffs intrinsic.
template <>
__device__ int find_first_set<uint32_t>(uint32_t x)
{
    const int position = __ffs(x);
    return position;
}

// 64-bit variant, backed by the __ffsll intrinsic.
template <>
__device__ int find_first_set<uint64_t>(uint64_t x)
{
    const int position = __ffsll(x);
    return position;
}

// Base-2 logarithm of x for x an exact power of two: the 1-based index of its
// lowest set bit, minus one.
template <typename T>
__device__ int lg(T x)
{
    const int first_set_bit = find_first_set(x);
    return first_set_bit - 1;
}

修改:我后来意识到,PTX中并没有"find first set"指令,目前所有nVIDIA GPU的指令集中也没有,所以让我们用以下内容替换lg():

// Number of set bits in x. Only the explicit specializations below are defined.
// Every declaration carries __device__: the bodies call device-only intrinsics,
// and a specialization without the qualifier would be a host function.
template <typename T> __device__ int population_count(T x);
// 32-bit variant, backed by the __popc intrinsic.
template <> __device__ int population_count<uint32_t>(uint32_t x) { return __popc(x);   }
// 64-bit variant, backed by the __popcll intrinsic.
template <> __device__ int population_count<uint64_t>(uint64_t x) { return __popcll(x); }

// Base-2 logarithm of x, valid only when x is an exact power of two:
// x - 1 then has exactly log2(x) low bits set, so counting them gives the answer.
template <typename T>
__device__ int lg_for_power_of_2(T x)
{
    const auto bits_below_x = x - 1;
    return population_count(bits_below_x);
}

我们现在需要实施

// To be implemented: divide p (the dividend) by q (the divisor, a power of two,
// hence non-zero) and round the quotient up. Wanted for T = uint32_t and T = uint64_t.
template <typename T> T div_by_power_of_2_rounding_up(T p, T q);

...针对 T = uint32_t 和 T = uint64_t。(p是被除数,q是除数。)

备注:

  • 与原始问题一样,我们不能假设 p <= std::numeric_limits<T>::max() - q,也不能假设 p > 0——否则各种有趣的替代方案就没有意义了 :-)
  • 0不是2的幂,所以我们可以假设q != 0
  • 我意识到32位和64位的解决方案可能会有所不同;我对前者更感兴趣,但后者也感兴趣。
  • 让我们关注Maxwell和Pascal芯片。

6 个答案:

答案 0 :(得分:3)

借助漏斗移位(funnel shift),一种可行的32位策略是(实质上)做一次33位的移位,把加法产生的进位保留下来,使加法在移位之前完整完成,例如(未经测试):

// Fragment: dividend, mask and log_2_of_divisor are declared outside this snippet
// (mask is presumably divisor - 1 -- confirm against the caller).
// 32-bit sum; it wraps around when dividend + mask does not fit in 32 bits.
unsigned sum = dividend + mask;
// (sum < mask) is 1 exactly when the addition above wrapped, i.e. it is the lost carry bit.
// The funnel shift treats {carry : sum} as one wide value and shifts it right,
// so the carry is shifted back into the result instead of being dropped.
unsigned result = __funnelshift_r(sum, sum < mask, log_2_of_divisor);

由@einpoklum编辑

使用@ RobertCrovella的程序测试,似乎工作正常。 SM_61的测试内核PTX是:

    .reg .pred      %p<2>;
    .reg .b32       %r<12>;


    ld.param.u32    %r5, [_Z4testjj_param_0];
    ld.param.u32    %r6, [_Z4testjj_param_1];
    neg.s32         %r7, %r6;
    and.b32         %r8, %r6, %r7;
    clz.b32         %r9, %r8;
    mov.u32         %r10, 31;
    sub.s32         %r4, %r10, %r9;
    add.s32         %r11, %r6, -1;
    add.s32         %r2, %r11, %r5;
    setp.lt.u32     %p1, %r2, %r11;
    selp.u32        %r3, 1, 0, %p1;
    // inline asm
    shf.r.wrap.b32 %r1, %r2, %r3, %r4;
    // inline asm
    st.global.u32   [r], %r1;
    ret;

并且SASS是:

/*0008*/                   MOV R1, c[0x0][0x20];                 /* 0x4c98078000870001 */
/*0010*/                   MOV R0, c[0x0][0x144];                /* 0x4c98078005170000 */
/*0018*/                   IADD R2, RZ, -c[0x0][0x144];          /* 0x4c1100000517ff02 */
                                                                 /* 0x001c4c00fe4007f1 */
/*0028*/                   IADD32I R0, R0, -0x1;                 /* 0x1c0ffffffff70000 */
/*0030*/                   LOP.AND R2, R2, c[0x0][0x144];        /* 0x4c47000005170202 */
/*0038*/                   FLO.U32 R2, R2;                       /* 0x5c30000000270002 */
                                                                 /* 0x003fd800fe2007e6 */
/*0048*/                   IADD R5, R0, c[0x0][0x140];           /* 0x4c10000005070005 */
/*0050*/                   ISETP.LT.U32.AND P0, PT, R5, R0, PT;  /* 0x5b62038000070507 */
/*0058*/                   IADD32I R0, -R2, 0x1f;                /* 0x1d00000001f70200 */
                                                                 /* 0x001fc400fe2007f6 */
/*0068*/                   IADD32I R0, -R0, 0x1f;                /* 0x1d00000001f70000 */
/*0070*/                   SEL R6, RZ, 0x1, !P0;                 /* 0x38a004000017ff06 */
/*0078*/                   MOV32I R2, 0x0;                       /* 0x010000000007f002 */
                                                                 /* 0x0003c400fe4007e4 */
/*0088*/                   MOV32I R3, 0x0;                       /* 0x010000000007f003 */
/*0090*/                   SHF.R.W R0, R5, R0, R6;               /* 0x5cfc030000070500 */
/*0098*/                   STG.E [R2], R0;                       /* 0xeedc200000070200 */
                                                                 /* 0x001f8000ffe007ff */
/*00a8*/                   EXIT;                                 /* 0xe30000000007000f */
/*00b0*/                   BRA 0xb0;                             /* 0xe2400fffff87000f */
/*00b8*/                   NOP;                                  /* 0x50b0000000070f00 */

答案 1 :(得分:2)

这是对一个在CPU上表现良好的答案(a well-performing answer)的改编:

// Ceiling division of dividend by divisor, where divisor is a power of two.
// The quotient is assembled from two shifted pieces so that dividend + (divisor - 1)
// is never formed in full width:
//   floor(dividend / divisor)  +  (1 if the remainder is non-zero, else 0)
template <typename T>
__device__ T div_by_power_of_2_rounding_up(T dividend, T divisor)
{
    const auto shift         = lg(divisor);
    const auto low_bits_mask = divisor - 1;

    // Truncated quotient.
    const auto floor_quotient = dividend >> shift;

    // remainder + (divisor - 1) reaches divisor (bit `shift`) only if remainder != 0.
    const auto remainder = dividend & low_bits_mask;
    const auto round_up  = (remainder + low_bits_mask) >> shift;

    return floor_quotient + round_up;
}

我想知道是否可以做得更好。

SM_61的SASS代码(使用@RobertCrovella的测试内核)是:

code for sm_61
        Function : test(unsigned int, unsigned int)
.headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                           /* 0x001fd400fe2007f6 */
/*0008*/                   MOV R1, c[0x0][0x20];           /* 0x4c98078000870001 */
/*0010*/                   IADD R0, RZ, -c[0x0][0x144];    /* 0x4c1100000517ff00 */
/*0018*/                   MOV R2, c[0x0][0x144];          /* 0x4c98078005170002 */
                                                           /* 0x003fc40007a007f2 */
/*0028*/                   LOP.AND R0, R0, c[0x0][0x144];  /* 0x4c47000005170000 */
/*0030*/                   FLO.U32 R3, R0;                 /* 0x5c30000000070003 */
/*0038*/                   IADD32I R0, R2, -0x1;           /* 0x1c0ffffffff70200 */
                                                           /* 0x001fc400fcc017f5 */
/*0048*/                   IADD32I R3, -R3, 0x1f;          /* 0x1d00000001f70303 */
/*0050*/                   LOP.AND R2, R0, c[0x0][0x140];  /* 0x4c47000005070002 */
/*0058*/                   IADD R2, R0, R2;                /* 0x5c10000000270002 */
                                                           /* 0x001fd000fe2007f1 */
/*0068*/                   IADD32I R0, -R3, 0x1f;          /* 0x1d00000001f70300 */
/*0070*/                   MOV R3, c[0x0][0x140];          /* 0x4c98078005070003 */
/*0078*/                   MOV32I R6, 0x0;                 /* 0x010000000007f006 */
                                                           /* 0x001fc400fc2407f1 */
/*0088*/                   SHR.U32 R4, R2, R0.reuse;       /* 0x5c28000000070204 */
/*0090*/                   SHR.U32 R5, R3, R0;             /* 0x5c28000000070305 */
/*0098*/                   MOV R2, R6;                     /* 0x5c98078000670002 */
                                                           /* 0x0003c400fe4007f4 */
/*00a8*/                   MOV32I R3, 0x0;                 /* 0x010000000007f003 */
/*00b0*/                   IADD R0, R4, R5;                /* 0x5c10000000570400 */
/*00b8*/                   STG.E [R2], R0;                 /* 0xeedc200000070200 */
                                                           /* 0x001f8000ffe007ff */
/*00c8*/                   EXIT;                           /* 0xe30000000007000f */
/*00d0*/                   BRA 0xd0;                       /* 0xe2400fffff87000f */
/*00d8*/                   NOP;                            /* 0x50b0000000070f00 */
                                                           /* 0x001f8000fc0007e0 */
/*00e8*/                   NOP;                            /* 0x50b0000000070f00 */
/*00f0*/                   NOP;                            /* 0x50b0000000070f00 */
/*00f8*/                   NOP;                            /* 0x50b0000000070f00 */

FLO是"查找前导1"(find leading 1)指令(谢谢@tera)。无论如何,这些指令还是很多,即使忽略(看起来像是)从常量内存加载的那几条……而启发这一实现的CPU函数只编译成:

    tzcnt   rax, rsi
    lea     rcx, [rdi - 1]
    shrx    rax, rcx, rax
    add     rax, 1
    test    rdi, rdi
    cmove   rax, rdi

(与clang 3.9.0)。

答案 2 :(得分:2)

// Ceiling division of p by the power-of-two q, computed as ((p - 1) >> log2(q)) + 1.
// p == 0 is handled separately because p - 1 would wrap around.
template <typename T> __device__ T div_by_power_of_2_rounding_up(T p, T q)
{
    if (p == 0) {
        return 0;
    }
    const auto shift = lg(q);
    return ((p - 1) >> shift) + 1;
}

如果我没数错,这比Robert's previous answer少一条指令(但请参阅他的comeback),或者说与漏斗移位方案的指令数相同。不过它有一个分支——不确定这是否会造成差别(除了整个warp的p输入都为零时会有好处之外):

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

    code for sm_61
        Function : _Z4testjj
    .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                                         /* 0x001fc000fda007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                                 /* 0x4c98078000870001 */
        /*0010*/                   ISETP.EQ.AND P0, PT, RZ, c[0x0][0x140], PT;           /* 0x4b6503800507ff07 */
        /*0018*/         {         MOV R0, RZ;                                           /* 0x5c9807800ff70000 */
        /*0028*/               @P0 BRA 0x90;        }                                    /* 0x001fc800fec007fd */
                                                                                         /* 0xe24000000600000f */
        /*0030*/                   IADD R0, RZ, -c[0x0][0x144];                          /* 0x4c1100000517ff00 */
        /*0038*/                   LOP.AND R0, R0, c[0x0][0x144];                        /* 0x4c47000005170000 */
                                                                                         /* 0x003fc400ffa00711 */
        /*0048*/                   FLO.U32 R0, R0;                                       /* 0x5c30000000070000 */
        /*0050*/                   MOV R3, c[0x0][0x140];                                /* 0x4c98078005070003 */
        /*0058*/                   IADD32I R2, -R0, 0x1f;                                /* 0x1d00000001f70002 */
                                                                                         /* 0x001fd800fcc007f5 */
        /*0068*/                   IADD32I R0, R3, -0x1;                                 /* 0x1c0ffffffff70300 */
        /*0070*/                   IADD32I R2, -R2, 0x1f;                                /* 0x1d00000001f70202 */
        /*0078*/                   SHR.U32 R0, R0, R2;                                   /* 0x5c28000000270000 */
                                                                                         /* 0x001fc800fe2007f6 */
        /*0088*/                   IADD32I R0, R0, 0x1;                                  /* 0x1c00000000170000 */
        /*0090*/                   MOV32I R2, 0x0;                                       /* 0x010000000007f002 */
        /*0098*/                   MOV32I R3, 0x0;                                       /* 0x010000000007f003 */
                                                                                         /* 0x001ffc00ffe000f1 */
        /*00a8*/                   STG.E [R2], R0;                                       /* 0xeedc200000070200 */
        /*00b0*/                   EXIT;                                                 /* 0xe30000000007000f */
        /*00b8*/                   BRA 0xb8;                                             /* 0xe2400fffff87000f */
        ..........................

我认为通过直接用PTX编写,仍然可以再省掉一两条指令(早上更新:在此期间Robert has proven了这一点),但我真的需要去睡觉了。

Update2 :这样做(使用Harold's funnel shift并在PTX中编写该功能)

// Ceiling division of p by the power-of-two q, written directly in PTX.
// Computes the 33-bit sum p + (q - 1) as {hi:lo} (hi holds only the carry) and
// funnel-shifts it right by log2(q), so the carry is not lost when the 32-bit
// addition wraps.
//   p: dividend; q: divisor, must be a power of two (non-zero).
//   returns: ceil(p / q).
__device__ uint32_t div_by_power_of_2_rounding_up(uint32_t p, uint32_t q)
{
  uint32_t ret;
  asm volatile("{\n\t"
               ".reg .u32       shift, mask, lo, hi;\n\t"
               "bfind.u32       shift, %2;\n\t"         // shift = index of q's highest set bit = log2(q)
               "sub.u32         mask, %2, 1;\n\t"       // mask  = q - 1
               "add.cc.u32      lo, %1, mask;\n\t"      // lo    = p + mask, carry-out recorded
               "addc.u32        hi, 0, 0;\n\t"          // hi    = carry from the addition above
               "shf.r.wrap.b32  %0, lo, hi, shift;\n\t" // ret   = low 32 bits of {hi:lo} >> shift
               "}"
                : "=r"(ret) : "r"(p), "r"(q));
  return ret;
}

这样得到的指令数,与Robert用他更简单的C代码已经达到的指令数相同:

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

    code for sm_61
        Function : _Z4testjj
    .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                            /* 0x001fc000fec007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                    /* 0x4c98078000870001 */
        /*0010*/                   MOV R0, c[0x0][0x144];                   /* 0x4c98078005170000 */
        /*0018*/         {         IADD32I R2, R0, -0x1;                    /* 0x1c0ffffffff70002 */
        /*0028*/                   FLO.U32 R0, c[0x0][0x144];        }      /* 0x001fc400fec00716 */
                                                                            /* 0x4c30000005170000 */
        /*0030*/                   IADD R5.CC, R2, c[0x0][0x140];           /* 0x4c10800005070205 */
        /*0038*/                   IADD.X R6, RZ, RZ;                       /* 0x5c1008000ff7ff06 */
                                                                            /* 0x003fc800fc8007f1 */
        /*0048*/                   MOV32I R2, 0x0;                          /* 0x010000000007f002 */
        /*0050*/                   MOV32I R3, 0x0;                          /* 0x010000000007f003 */
        /*0058*/                   SHF.R.W R0, R5, R0, R6;                  /* 0x5cfc030000070500 */
                                                                            /* 0x001ffc00ffe000f1 */
        /*0068*/                   STG.E [R2], R0;                          /* 0xeedc200000070200 */
        /*0070*/                   EXIT;                                    /* 0xe30000000007000f */
        /*0078*/                   BRA 0x78;                                /* 0xe2400fffff87000f */
        ..........................

答案 3 :(得分:2)

重新审视@tera的那个巧妙的回答(the kewl answer):

 // Branch-free ceiling division of p by the power-of-two q:
 // ((p - 1) >> log2(q)) + 1, multiplied by 0 when p == 0 and by 1 otherwise.
 template <typename T> __device__ T pdqru(T p, T q)
{
    const auto shift      = lg(q);
    const auto rounded_up = ((p - 1) >> shift) + 1;
    const auto p_nonzero  = bool(p);
    return p_nonzero * rounded_up;
}

用11条指令(无分支、无谓词执行)把结果放入R0:

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

        code for sm_61
                Function : _Z4testjj
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                   /* 0x001fc800fec007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];           /* 0x4c98078000870001 */
        /*0010*/                   IADD R0, RZ, -c[0x0][0x144];    /* 0x4c1100000517ff00 */
        /*0018*/                   LOP.AND R0, R0, c[0x0][0x144];  /* 0x4c47000005170000 */
                                                                   /* 0x003fc400ffa00711 */
        /*0028*/                   FLO.U32 R0, R0;                 /* 0x5c30000000070000 */
        /*0030*/                   MOV R5, c[0x0][0x140];          /* 0x4c98078005070005 */
        /*0038*/                   IADD32I R2, -R0, 0x1f;          /* 0x1d00000001f70002 */
                                                                   /* 0x001fd800fcc007f5 */
        /*0048*/                   IADD32I R0, R5, -0x1;           /* 0x1c0ffffffff70500 */
        /*0050*/                   IADD32I R2, -R2, 0x1f;          /* 0x1d00000001f70202 */
        /*0058*/                   SHR.U32 R0, R0, R2;             /* 0x5c28000000270000 */
                                                                   /* 0x001fd000fe2007f1 */
        /*0068*/                   IADD32I R0, R0, 0x1;            /* 0x1c00000000170000 */
        /*0070*/                   MOV32I R2, 0x0;                 /* 0x010000000007f002 */
        /*0078*/                   MOV32I R3, 0x0;                 /* 0x010000000007f003 */
                                                                   /* 0x001ffc001e2007f2 */
        /*0088*/                   ICMP.NE R0, R0, RZ, R5;         /* 0x5b4b02800ff70000 */
        /*0090*/                   STG.E [R2], R0;                 /* 0xeedc200000070200 */
        /*0098*/                   EXIT;                           /* 0xe30000000007000f */
                                                                   /* 0x001f8000fc0007ff */
        /*00a8*/                   BRA 0xa0;                       /* 0xe2400fffff07000f */
        /*00b0*/                   NOP;                            /* 0x50b0000000070f00 */
        /*00b8*/                   NOP;                            /* 0x50b0000000070f00 */
                ..........................

在研究了上述SASS代码之后,这两条指令似乎很明显:

        /*0038*/                   IADD32I R2, -R0, 0x1f;          /* 0x1d00000001f70002 */
                                                                   /* 0x001fd800fcc007f5 */
        ...
        /*0050*/                   IADD32I R2, -R2, 0x1f;          /* 0x1d00000001f70202 */

……应该是不必要的。我没有确切的解释,但我的猜测是:由于FLO.U32这条SASS指令与__ffs()内建函数(intrinsic)的语义并不完全相同,编译器在使用该内建函数时显然有一套固定的惯用写法,用额外的指令把真正干活的FLO指令包装起来。在C++源代码层面如何绕开这一点并不明显,但我可以通过bfind PTX instruction进一步减少指令数(按我的计数方式,即把结果放入寄存器所需的指令数):

$ cat t107.cu
#include <cstdio>
#include <cstdint>
// Device-global result slot: written by the test kernel, read back on the host
// in main() with cudaMemcpyFromSymbol.
__device__ unsigned r = 0;


// Thin wrapper around the PTX bfind.u32 instruction: returns the bit index of
// the most significant set bit of val.
static __device__ __inline__ uint32_t __my_bfind(uint32_t val)
{
  uint32_t bit_index;
  asm volatile("bfind.u32 %0, %1;" : "=r"(bit_index) : "r"(val));
  return bit_index;
}

// Branch-free ceiling division of p by the power-of-two q, taking log2(q)
// from the bfind wrapper: ((p - 1) >> log2(q)) + 1, forced to 0 when p == 0.
template <typename T> __device__ T pdqru(T p, T q)
{
    const auto shift      = __my_bfind(q);
    const auto rounded_up = ((p - 1) >> shift) + 1;
    const auto p_nonzero  = bool(p);
    return p_nonzero * rounded_up;
}

// Test kernel. With USE_DISPLAY defined it prints pdqru() for a set of operand
// pairs (including a zero divisor and a zero dividend); otherwise it stores
// pdqru(p, q) into the device global r.
__global__ void test(unsigned p, unsigned q){
#ifdef USE_DISPLAY
  unsigned sixteen = 16;       // second power-of-two divisor
  unsigned zero    = 0;        // used both as divisor and as dividend
  unsigned top_bit = 1U<<31;   // largest 32-bit power of two
  printf("result %u/%u = %u\n", p, q, pdqru(p, q));
  printf("result %u/%u = %u\n", p, sixteen, pdqru(p, sixteen));
  printf("result %u/%u = %u\n", p, zero, pdqru(p, zero));
  printf("result %u/%u = %u\n", zero, q, pdqru(zero, q));
  printf("result %u/%u = %u\n", top_bit, q, pdqru(top_bit, q));
  printf("result %u/%u = %u\n", q, top_bit, pdqru(q, top_bit));
  printf("result %u/%u = %u\n", top_bit, top_bit, pdqru(top_bit, top_bit));
  printf("result %u/%u = %u\n", q, q, pdqru(q, q));
#else
  r = pdqru(p, q);
#endif
}


// Launches the test kernel with one thread, copies the device global r back to
// the host and prints it. Returns non-zero if the launch, the kernel or the copy
// reports a CUDA error, instead of printing an unrelated value.
int main(){
  unsigned h_r = 0;
  test<<<1,1>>>(32767, 32);
  // Kernel launches return no status; launch errors are reported here.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    // Blocking copy: ordered after the kernel, and also reports errors raised while it ran.
    err = cudaMemcpyFromSymbol(&h_r, r, sizeof(unsigned));
  }
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  printf("result = %u\n", h_r);
  return 0;
}


$ nvcc -arch=sm_61 -o t107 t107.cu -std=c++11
$ cuobjdump -sass t107

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_61

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

        code for sm_61
                Function : _Z4testjj
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                        /* 0x001c4400fe0007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                /* 0x4c98078000870001 */
        /*0010*/         {         MOV32I R3, 0x0;                      /* 0x010000000007f003 */
        /*0018*/                   FLO.U32 R2, c[0x0][0x144];        }  /* 0x4c30000005170002 */
                                                                        /* 0x003fd800fec007f6 */
        /*0028*/                   MOV R5, c[0x0][0x140];               /* 0x4c98078005070005 */
        /*0030*/                   IADD32I R0, R5, -0x1;                /* 0x1c0ffffffff70500 */
        /*0038*/                   SHR.U32 R0, R0, R2;                  /* 0x5c28000000270000 */
                                                                        /* 0x001fc800fca007f1 */
        /*0048*/                   IADD32I R0, R0, 0x1;                 /* 0x1c00000000170000 */
        /*0050*/                   MOV32I R2, 0x0;                      /* 0x010000000007f002 */
        /*0058*/                   ICMP.NE R0, R0, RZ, R5;              /* 0x5b4b02800ff70000 */
                                                                        /* 0x001ffc00ffe000f1 */
        /*0068*/                   STG.E [R2], R0;                      /* 0xeedc200000070200 */
        /*0070*/                   EXIT;                                /* 0xe30000000007000f */
        /*0078*/                   BRA 0x78;                            /* 0xe2400fffff87000f */
                ..........................



Fatbin ptx code:
================
arch = sm_61
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$ nvcc -arch=sm_61 -o t107 t107.cu -std=c++11 -DUSE_DISPLAY
$ cuda-memcheck ./t107
========= CUDA-MEMCHECK
result 32767/32 = 1024
result 32767/16 = 2048
result 32767/0 = 1
result 0/32 = 0
result 2147483648/32 = 67108864
result 32/2147483648 = 1
result 2147483648/2147483648 = 1
result 32/32 = 1
result = 0
========= ERROR SUMMARY: 0 errors
$

我只展示了上面的32位示例。

我想我可以说,在上面的内核SASS中,实际上只有6条指令在做"工作",其余指令都是内核"开销",和/或是把寄存器中的结果存入全局内存所需的指令。很明显,仅就这个函数本身而言,编译器只生成了以下这些指令:

        /*0018*/                   FLO.U32 R2, c[0x0][0x144];  // find bit set in q
                                                                        /*  */
        /*0028*/                   MOV R5, c[0x0][0x140];      // load p
        /*0030*/                   IADD32I R0, R5, -0x1;       // subtract 1 from p
        /*0038*/                   SHR.U32 R0, R0, R2;         // shift p right by q bit
                                                                        /*  */
        /*0048*/                   IADD32I R0, R0, 0x1;        // add 1 to result
        /*0050*/                   ...                                  /*  */
        /*0058*/                   ICMP.NE R0, R0, RZ, R5;     // account for p=0 case

然而,这与我对其他情况的计数方式不一致(按这种算法,其他情况的计数都应该再减少1)。

答案 4 :(得分:1)

一种可能的简单方法:

$ cat t105.cu
#include <cstdio>

// Device-global result slot: written by the test kernel, read back on the host
// in main() with cudaMemcpyFromSymbol.
__device__ unsigned r = 0;

// Ceiling division of p by the power-of-two q.
// Normally (p + (q - 1)) >> log2(q). When that sum wraps around, p cannot be a
// multiple of q, so the answer is floor(p / q) + 1.
template <typename T>
__device__ T pdqru(T p, T q){

  const T biased = p + (q-1);
  // 1-based index of q's lowest set bit; the 64-bit intrinsic is used for 8-byte T.
  const T bit_position = (sizeof(T) == 8) ? __ffsll(q) : __ffs(q);
  const T shift = bit_position - 1;
  if (biased < p) {
    // The addition overflowed.
    return (p >> shift) + 1;
  }
  return biased >> shift;
}

// Test kernel. With USE_DISPLAY defined it prints pdqru() for a set of operand
// pairs (including a zero divisor and a zero dividend); otherwise it stores
// pdqru(p, q) into the device global r.
__global__ void test(unsigned p, unsigned q){
#ifdef USE_DISPLAY
  unsigned sixteen = 16;       // second power-of-two divisor
  unsigned zero    = 0;        // used both as divisor and as dividend
  unsigned top_bit = 1U<<31;   // largest 32-bit power of two
  printf("result %u/%u = %u\n", p, q, pdqru(p, q));
  printf("result %u/%u = %u\n", p, sixteen, pdqru(p, sixteen));
  printf("result %u/%u = %u\n", p, zero, pdqru(p, zero));
  printf("result %u/%u = %u\n", zero, q, pdqru(zero, q));
  printf("result %u/%u = %u\n", top_bit, q, pdqru(top_bit, q));
  printf("result %u/%u = %u\n", q, top_bit, pdqru(q, top_bit));
  printf("result %u/%u = %u\n", top_bit, top_bit, pdqru(top_bit, top_bit));
  printf("result %u/%u = %u\n", q, q, pdqru(q, q));
#else
  r = pdqru(p, q);
#endif
}


// Launches the test kernel with one thread, copies the device global r back to
// the host and prints it. Returns non-zero if the launch, the kernel or the copy
// reports a CUDA error, instead of printing an unrelated value.
int main(){
  unsigned h_r = 0;
  test<<<1,1>>>(32767, 32);
  // Kernel launches return no status; launch errors are reported here.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    // Blocking copy: ordered after the kernel, and also reports errors raised while it ran.
    err = cudaMemcpyFromSymbol(&h_r, r, sizeof(unsigned));
  }
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  printf("result = %u\n", h_r);
  return 0;
}


$ nvcc -arch=sm_61 -o t105 t105.cu
$ cuobjdump -sass ./t105

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_61

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

        code for sm_61
                Function : _Z4testjj
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                         /* 0x001fc800fec007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];                 /* 0x4c98078000870001 */
        /*0010*/                   IADD R0, RZ, -c[0x0][0x144];          /* 0x4c1100000517ff00 */
        /*0018*/                   LOP.AND R0, R0, c[0x0][0x144];        /* 0x4c47000005170000 */
                                                                         /* 0x005fd401fe20003d */
        /*0028*/                   FLO.U32 R2, R0;                       /* 0x5c30000000070002 */
        /*0030*/                   MOV R0, c[0x0][0x144];                /* 0x4c98078005170000 */
        /*0038*/                   IADD32I R3, -R2, 0x1f;                /* 0x1d00000001f70203 */
                                                                         /* 0x001fd000fc2007f1 */
        /*0048*/                   IADD32I R0, R0, -0x1;                 /* 0x1c0ffffffff70000 */
        /*0050*/                   MOV R2, c[0x0][0x140];                /* 0x4c98078005070002 */
        /*0058*/                   IADD32I R4, -R3, 0x1f;                /* 0x1d00000001f70304 */
                                                                         /* 0x001fd800fe2007f6 */
        /*0068*/                   IADD R5, R0, c[0x0][0x140];           /* 0x4c10000005070005 */
        /*0070*/                   ISETP.LT.U32.AND P0, PT, R5, R0, PT;  /* 0x5b62038000070507 */
        /*0078*/                   SHR.U32 R0, R2, R4;                   /* 0x5c28000000470200 */
                                                                         /* 0x001fd000fc2007f1 */
        /*0088*/                   IADD32I R0, R0, 0x1;                  /* 0x1c00000000170000 */
        /*0090*/                   MOV32I R2, 0x0;                       /* 0x010000000007f002 */
        /*0098*/                   MOV32I R3, 0x0;                       /* 0x010000000007f003 */
                                                                         /* 0x001ffc001e2007f2 */
        /*00a8*/              @!P0 SHR.U32 R0, R5, R4;                   /* 0x5c28000000480500 */
        /*00b0*/                   STG.E [R2], R0;                       /* 0xeedc200000070200 */
        /*00b8*/                   EXIT;                                 /* 0xe30000000007000f */
                                                                         /* 0x001f8000fc0007ff */
        /*00c8*/                   BRA 0xc0;                             /* 0xe2400fffff07000f */
        /*00d0*/                   NOP;                                  /* 0x50b0000000070f00 */
        /*00d8*/                   NOP;                                  /* 0x50b0000000070f00 */
                                                                         /* 0x001f8000fc0007e0 */
        /*00e8*/                   NOP;                                  /* 0x50b0000000070f00 */
        /*00f0*/                   NOP;                                  /* 0x50b0000000070f00 */
        /*00f8*/                   NOP;                                  /* 0x50b0000000070f00 */
                ..........................



Fatbin ptx code:
================
arch = sm_61
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$ nvcc -arch=sm_61 -o t105 t105.cu -DUSE_DISPLAY
$ cuda-memcheck ./t105
========= CUDA-MEMCHECK
result 32767/32 = 1024
result 32767/16 = 2048
result 32767/0 = 2048
result 0/32 = 0
result 2147483648/32 = 67108864
result 32/2147483648 = 1
result 2147483648/2147483648 = 1
result 32/32 = 1
result = 0
========= ERROR SUMMARY: 0 errors
$

对于32位情况,大约有14条SASS指令,以便将答案输入R0。它会为除零情况产生虚假结果。

this answer所对应的等效汇编代码如下所示:

$ cat t106.cu
#include <cstdio>
#include <cstdint>
// Device-global result slot: written by the test kernel, read back on the host
// in main() with cudaMemcpyFromSymbol.
__device__ unsigned r = 0;


// 1-based index of the least significant set bit of x (the CUDA __ffs family
// returns 0 when no bit is set). Only the explicit specializations below are defined.
template <typename T> __device__ int find_first_set(T x);

// 32-bit variant, backed by the __ffs intrinsic.
template <>
__device__ int find_first_set<uint32_t>(uint32_t x)
{
    const int position = __ffs(x);
    return position;
}

// 64-bit variant, backed by the __ffsll intrinsic.
template <>
__device__ int find_first_set<uint64_t>(uint64_t x)
{
    const int position = __ffsll(x);
    return position;
}

// Base-2 logarithm of x for x an exact power of two: the 1-based index of its
// lowest set bit, minus one. The result is returned in the operand's own type T.
template <typename T>
__device__ T lg(T x)
{
    const int first_set_bit = find_first_set(x);
    return first_set_bit - 1;
}

// Ceiling division of dividend by divisor, where divisor is a power of two.
// The quotient is assembled from two shifted pieces so that dividend + (divisor - 1)
// is never formed in full width:
//   floor(dividend / divisor)  +  (1 if the remainder is non-zero, else 0)
template <typename T>
__device__ T pdqru(T dividend, T divisor)
{
    const auto shift         = lg(divisor);
    const auto low_bits_mask = divisor - 1;

    // Truncated quotient.
    const auto floor_quotient = dividend >> shift;

    // remainder + (divisor - 1) reaches divisor (bit `shift`) only if remainder != 0.
    const auto remainder = dividend & low_bits_mask;
    const auto round_up  = (remainder + low_bits_mask) >> shift;

    return floor_quotient + round_up;
}

// Test kernel. With USE_DISPLAY defined it prints pdqru() for a set of operand
// pairs (including a zero divisor and a zero dividend); otherwise it stores
// pdqru(p, q) into the device global r.
__global__ void test(unsigned p, unsigned q){
#ifdef USE_DISPLAY
  unsigned sixteen = 16;       // second power-of-two divisor
  unsigned zero    = 0;        // used both as divisor and as dividend
  unsigned top_bit = 1U<<31;   // largest 32-bit power of two
  printf("result %u/%u = %u\n", p, q, pdqru(p, q));
  printf("result %u/%u = %u\n", p, sixteen, pdqru(p, sixteen));
  printf("result %u/%u = %u\n", p, zero, pdqru(p, zero));
  printf("result %u/%u = %u\n", zero, q, pdqru(zero, q));
  printf("result %u/%u = %u\n", top_bit, q, pdqru(top_bit, q));
  printf("result %u/%u = %u\n", q, top_bit, pdqru(q, top_bit));
  printf("result %u/%u = %u\n", top_bit, top_bit, pdqru(top_bit, top_bit));
  printf("result %u/%u = %u\n", q, q, pdqru(q, q));
#else
  r = pdqru(p, q);
#endif
}


// Launches the test kernel with one thread, copies the device global r back to
// the host and prints it. Returns non-zero if the launch, the kernel or the copy
// reports a CUDA error, instead of printing an unrelated value.
int main(){
  unsigned h_r = 0;
  test<<<1,1>>>(32767, 32);
  // Kernel launches return no status; launch errors are reported here.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    // Blocking copy: ordered after the kernel, and also reports errors raised while it ran.
    err = cudaMemcpyFromSymbol(&h_r, r, sizeof(unsigned));
  }
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }
  printf("result = %u\n", h_r);
  return 0;
}


$ nvcc -std=c++11  -arch=sm_61 -o t106 t106.cu
$ cuobjdump -sass t106

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_61

Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

        code for sm_61
                Function : _Z4testjj
        .headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                   /* 0x001fd400fe2007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20];           /* 0x4c98078000870001 */
        /*0010*/                   IADD R0, RZ, -c[0x0][0x144];    /* 0x4c1100000517ff00 */
        /*0018*/                   MOV R2, c[0x0][0x144];          /* 0x4c98078005170002 */
                                                                   /* 0x003fc40007a007f2 */
        /*0028*/                   LOP.AND R0, R0, c[0x0][0x144];  /* 0x4c47000005170000 */
        /*0030*/                   FLO.U32 R3, R0;                 /* 0x5c30000000070003 */
        /*0038*/                   IADD32I R0, R2, -0x1;           /* 0x1c0ffffffff70200 */
                                                                   /* 0x001fc400fcc017f5 */
        /*0048*/                   IADD32I R3, -R3, 0x1f;          /* 0x1d00000001f70303 */
        /*0050*/                   LOP.AND R2, R0, c[0x0][0x140];  /* 0x4c47000005070002 */
        /*0058*/                   IADD R2, R0, R2;                /* 0x5c10000000270002 */
                                                                   /* 0x001fd000fe2007f1 */
        /*0068*/                   IADD32I R0, -R3, 0x1f;          /* 0x1d00000001f70300 */
        /*0070*/                   MOV R3, c[0x0][0x140];          /* 0x4c98078005070003 */
        /*0078*/                   MOV32I R6, 0x0;                 /* 0x010000000007f006 */
                                                                   /* 0x001fc400fc2407f1 */
        /*0088*/                   SHR.U32 R4, R2, R0.reuse;       /* 0x5c28000000070204 */
        /*0090*/                   SHR.U32 R5, R3, R0;             /* 0x5c28000000070305 */
        /*0098*/                   MOV R2, R6;                     /* 0x5c98078000670002 */
                                                                   /* 0x0003c400fe4007f4 */
        /*00a8*/                   MOV32I R3, 0x0;                 /* 0x010000000007f003 */
        /*00b0*/                   IADD R0, R4, R5;                /* 0x5c10000000570400 */
        /*00b8*/                   STG.E [R2], R0;                 /* 0xeedc200000070200 */
                                                                   /* 0x001f8000ffe007ff */
        /*00c8*/                   EXIT;                           /* 0xe30000000007000f */
        /*00d0*/                   BRA 0xd0;                       /* 0xe2400fffff87000f */
        /*00d8*/                   NOP;                            /* 0x50b0000000070f00 */
                                                                   /* 0x001f8000fc0007e0 */
        /*00e8*/                   NOP;                            /* 0x50b0000000070f00 */
        /*00f0*/                   NOP;                            /* 0x50b0000000070f00 */
        /*00f8*/                   NOP;                            /* 0x50b0000000070f00 */
                ..........................



Fatbin ptx code:
================
arch = sm_61
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$
根据我的统计,它似乎要多1条指令。

答案 5 :(得分:1)

以下是借助位计数(population count)的另一种解法。我只尝试了32位版本,并对照参考实现做了穷举测试。由于除数q是2的幂,我们可以借助位计数操作轻松确定移位量s。截断除法的余数t可以用一个直接由除数q导出的简单掩码m来计算。

// Reference implementation of ceil(p/q) using an ordinary integer division.
// Intended domain: p in [0,0xffffffff], q = (1 << s) with s in [0,31].
__device__ uint32_t reference (uint32_t p, uint32_t q)
{
    const uint32_t quotient = p / q;
    // The truncated quotient is too small exactly when q * quotient falls short of p.
    const bool inexact = (q * quotient) < p;
    return inexact ? quotient + 1 : quotient;
}

// Computes ceil(p/q) with a population count instead of a division.
// Intended domain: p in [0,0xffffffff], q = (1 << s) with s in [0,31].
__device__ uint32_t solution (uint32_t p, uint32_t q)
{
    const uint32_t low_mask  = q - 1;              // the s low bits set
    const uint32_t shift     = __popc (low_mask);  // s = number of bits in the mask
    const uint32_t quotient  = p >> shift;         // truncated p / q
    const bool has_remainder = (p & low_mask) != 0;
    return quotient + (has_remainder ? 1u : 0u);   // round up when the division is inexact
}

solution()是否比之前发布的代码更快可能取决于特定的GPU架构。使用CUDA 8.0,它编译为以下PTX指令序列:

add.s32         %r3, %r2, -1;
popc.b32        %r4, %r3;
shr.u32         %r5, %r1, %r4;
and.b32         %r6, %r3, %r1;
setp.ne.s32     %p1, %r6, 0;
selp.u32        %r7, 1, 0, %p1;
add.s32         %r8, %r5, %r7;

对于sm_5x,这几乎是1:1地翻译成机器码,只是SETP和SELP这两条指令被合并成了一条ICMP,因为比较的对象是0。