如何找到__builtin_popcountll的确切实现?

时间:2016-06-27 18:13:41

标签: c++ simd icc

gcc编译器很旧(4.1.2)。我安装了2016版的Intel C编译器(icc)。我正在用icc编译一个使用“__builtin_popcountll”的程序。

根据 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36041 中的讨论,更高版本的gcc中有更快的__builtin_popcountll实现。我想知道我这里是否用到了这些更快的实现。

如何找出所使用的“__builtin_popcountll”的确切实现?

我不能使用SSE 4.2的popcnt指令。 cpu只支持sse 4.1。

cat /proc/cpuinfo 的输出:

processor   : 7
vendor_id   : GenuineIntel
cpu family  : 6
model       : 23
model name  : Intel(R) Xeon(R) CPU           E5420  @ 2.50GHz
stepping    : 10
cpu MHz     : 2493.752
cache size  : 6144 KB
physical id : 1
siblings    : 4
core id     : 3
cpu cores   : 4
fpu     : yes
fpu_exception   : yes
cpuid level : 13
wp      : yes
flags       : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm syscall nx lm constant_tsc pni monitor ds_cpl vmx est tm2 cx16 xtpr lahf_lm
bogomips    : 4987.85
clflush size    : 64
cache_alignment : 64
address sizes   : 38 bits physical, 48 bits virtual

谢谢。

更新

来自icc的asm输出

# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
# mark_description "-S -fsource-asm -c";
    .file "small.cpp"
    .text
..TXTST0:
# -- Begin  main
    .text
# mark_begin;
       .align    16,0x90
    .globl main
# --- main()
# Compiler-generated entry point (AT&T syntax). It builds a frame, calls the
# Intel runtime initializer, updates MXCSR and returns 0 in %eax.
# Note: no call to _Z4testi appears in this body although the echoed source
# calls test(i) -- presumably dropped because the result is unused
# (NOTE(review): confirm against the optimization level actually used).
main:
..B1.1:                         # Preds ..B1.0

### void main(void){

    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
..___tag_value_main.1:
..L2:
                                                          #9.16
        pushq     %rbp                                          #9.16
    .cfi_def_cfa_offset 16
        movq      %rsp, %rbp                                    #9.16
    .cfi_def_cfa 6, 16
    .cfi_offset 6, -16
        # Round %rsp down to a 128-byte boundary, then reserve 128 bytes;
        # the slot at (%rsp) is used below as scratch for MXCSR.
        andq      $-128, %rsp                                   #9.16
        subq      $128, %rsp                                    #9.16
        # Arguments for the runtime init call: %edi = 3, %esi = 0.
        # Their meaning is defined by the Intel runtime and is not visible here.
        xorl      %esi, %esi                                    #9.16
        movl      $3, %edi                                      #9.16
        call      __intel_new_feature_proc_init                 #9.16
                                # LOE rbx r12 r13 r14 r15
..B1.4:                         # Preds ..B1.1
        # Read-modify-write of MXCSR through the scratch slot.
        stmxcsr   (%rsp)                                        #9.16

###     int i = 1;
###     test(i);
### }

        # Return value of main: 0.
        xorl      %eax, %eax                                    #12.1
        # 32832 = 0x8040: sets MXCSR bit 15 (flush-to-zero) and bit 6
        # (denormals-are-zero).
        orl       $32832, (%rsp)                                #9.16
        ldmxcsr   (%rsp)                                        #9.16
        # Tear down the frame: restore the unaligned %rsp saved in %rbp.
        movq      %rbp, %rsp                                    #12.1
        popq      %rbp                                          #12.1
    .cfi_def_cfa 7, 8
    .cfi_restore 6
        ret                                                     #12.1
        .align    16,0x90
    .cfi_endproc
                                # LOE
# mark_end;
    .type   main,@function
    .size   main,.-main
    .data
# -- End  main
    .text
# -- Begin  _Z4testi
    .text
# mark_begin;
       .align    16,0x90
    .globl _Z4testi
# --- test(int)
# int test(int i): inline expansion of __builtin_popcountll(i).
# In:    %edi = i (sign-extended to 64 bits before counting)
# Out:   %rax = number of set bits in the 64-bit value
# Uses:  %rcx, %rdx, %rsi, %rdi, %r8, %r9 as scratch; no stack, no calls
# Method: branch-free bit-parallel count using the masks 0x5555..., 0x3333...,
#         0x0f0f... and a multiply by 0x0101...; no popcnt instruction is used.
#         The constant loads are interleaved with the arithmetic by the compiler.
_Z4testi:
# parameter 1: %edi
..B2.1:                         # Preds ..B2.0

### int test(int i){

    .cfi_startproc
    .cfi_personality 0x3,__gxx_personality_v0
..___tag_value__Z4testi.9:
..L10:
                                                         #6.16

###       return __builtin_popcountll(i); //somehow it says _mm_popcnt_u32 is illegal instruction 

        movq      $0x5555555555555555, %rax                     #7.14
        # x = (int64) i  -- a negative i gets its upper 32 bits set.
        movslq    %edi, %rdi                                    #7.14
        movq      $0x3333333333333333, %rcx                     #7.14
        # Step 1: x -= (x >> 1) & 0x5555...  -> each 2-bit field holds its bit count.
        movq      %rdi, %rdx                                    #7.14
        movq      $0xf0f0f0f0f0f0f0f, %r8                       #7.14
        shrq      $1, %rdx                                      #7.14
        movq      $0x101010101010101, %r9                       #7.14
        andq      %rax, %rdx                                    #7.14
        subq      %rdx, %rdi                                    #7.14
        # Step 2: x = (x & 0x3333...) + ((x >> 2) & 0x3333...)  -> 4-bit field counts.
        movq      %rdi, %rax                                    #7.14
        andq      %rcx, %rdi                                    #7.14
        shrq      $2, %rax                                      #7.14
        andq      %rcx, %rax                                    #7.14
        addq      %rdi, %rax                                    #7.14
        # Step 3: x = (x + (x >> 4)) & 0x0f0f...  -> one count per byte.
        movq      %rax, %rsi                                    #7.14
        shrq      $4, %rsi                                      #7.14
        addq      %rsi, %rax                                    #7.14
        andq      %r8, %rax                                     #7.14
        # Step 4: multiplying by 0x0101... accumulates the sum of all eight byte
        # counts in the top byte; the shift by 56 moves that sum into the low bits.
        imulq     %r9, %rax                                     #7.14
        shrq      $56, %rax                                     #7.14
        ret                                                     #7.14
        .align    16,0x90
    .cfi_endproc
                                # LOE
# mark_end;
    .type   _Z4testi,@function
    .size   _Z4testi,.-_Z4testi
    .data
# -- End  _Z4testi
    .data
    .section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
    .section .eh_frame,"a",@progbits
.eh_frame_seg:
    .align 8
# End

godbolt(Compiler Explorer)上 x86 icc 13.0.1 的输出

// Compiler Explorer sample: returns the number of set bits in num using the
// 64-bit builtin __builtin_popcountll.
// The int argument is widened to 64 bits before counting; the generated code
// sign-extends it (movslq / cltq), so for a negative num the 32 upper bits
// are set and are counted as well.
int square(int num) {
  return __builtin_popcountll(num);
}

# square(int) as emitted by icc 13.0.1 (AT&T syntax): inline expansion of
# __builtin_popcountll(num).
# In:  %edi = num.  Out: %eax = bit count of the sign-extended 64-bit value.
# The argument and the result are spilled to the stack frame. The first three
# stages use the masks 0x5555..., 0x3333..., 0x0f0f...; the byte counts are
# then summed with shift/add folding (8, 16, 32) and a final mask of 255
# rather than with a multiply.
L__routine_start__Z6squarei_0:
square(int):
        pushq     %rbp                                          #2.21
        movq      %rsp, %rbp                                    #2.21
        subq      $16, %rsp                                     #2.21
        # Spill num to -8(%rbp), reload it and sign-extend to 64 bits.
        movl      %edi, -8(%rbp)                                #2.21
        movl      -8(%rbp), %eax                                #3.10
        movslq    %eax, %rax                                    #3.10
        # Step 1: x -= (x >> 1) & 0x5555...  -> 2-bit field counts (in %rax).
        movq      %rax, %rdx                                    #3.10
        shrq      $1, %rdx                                      #3.10
        movq      $0x5555555555555555, %rcx                     #3.10
        andq      %rcx, %rdx                                    #3.10
        subq      %rdx, %rax                                    #3.10
        # Step 2: x = ((x >> 2) & 0x3333...) + (x & 0x3333...)  -> 4-bit counts (in %rdx).
        movq      %rax, %rdx                                    #3.10
        shrq      $2, %rdx                                      #3.10
        movq      $0x3333333333333333, %rcx                     #3.10
        andq      %rcx, %rdx                                    #3.10
        andq      %rcx, %rax                                    #3.10
        addq      %rax, %rdx                                    #3.10
        # Step 3: x = (x + (x >> 4)) & 0x0f0f...  -> one count per byte.
        movq      %rdx, %rax                                    #3.10
        shrq      $4, %rax                                      #3.10
        addq      %rax, %rdx                                    #3.10
        movq      $0xf0f0f0f0f0f0f0f, %rax                      #3.10
        andq      %rax, %rdx                                    #3.10
        # Step 4: fold the byte counts together: x += x >> 8; x += x >> 16;
        # x += x >> 32. The total ends up in the low byte.
        movq      %rdx, %rax                                    #3.10
        shrq      $8, %rax                                      #3.10
        addq      %rax, %rdx                                    #3.10
        movq      %rdx, %rax                                    #3.10
        shrq      $16, %rax                                     #3.10
        addq      %rax, %rdx                                    #3.10
        movq      %rdx, %rax                                    #3.10
        shrq      $32, %rax                                     #3.10
        addq      %rax, %rdx                                    #3.10
        # Keep only the low byte, which holds the total.
        andq      $255, %rdx                                    #3.10
        # Result goes through the stack slot -16(%rbp) into %eax.
        movl      %edx, -16(%rbp)                               #3.10
        movl      -16(%rbp), %eax                               #3.10
        leave                                                   #3.10
        ret                                                     #3.10

godbolt(Compiler Explorer)上 x86 gcc 6 的输出

# square(int) as emitted by gcc 6 (AT&T syntax).
# In:  %edi = num.  Out: %eax = whatever __popcountdi2 returns.
# The builtin is not expanded inline here: the count is delegated to the
# out-of-line helper __popcountdi2, whose body is not part of this listing
# (NOTE(review): this is the libgcc popcount routine -- inspect libgcc to see
# the algorithm actually used).
square(int):
        pushq   %rbp
        movq    %rsp, %rbp
        subq    $16, %rsp
        # Spill num to -4(%rbp) and reload it into %eax.
        movl    %edi, -4(%rbp)
        movl    -4(%rbp), %eax
        # Sign-extend %eax into %rax and pass it as the first argument.
        cltq
        movq    %rax, %rdi
        call    __popcountdi2
        # The helper's return value is left in place as this function's result.
        leave
        ret

看不出__popcountdi2具体是什么。不过我想我会直接测试当前编译器内建实现的速度和gcc.gnu.org上给出的实现的速度,看看哪一个更快。

0 个答案:

没有答案