gcc编译器很旧(4.1.2)。我安装了2016版的Intel C编译器(icc),正在用icc编译一个使用了“__builtin_popcountll”的程序。
根据 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36041 中的讨论,更高版本的gcc对__builtin_popcountll有更快的实现。我想知道icc是否使用了这些更快的实现。
如何找出所使用的“__builtin_popcountll”的确切实现?
我不能使用SSE 4.2的popcnt指令。 cpu只支持sse 4.1。
cat /proc/cpuinfo 的输出:
processor : 7
vendor_id : GenuineIntel
cpu family : 6
model : 23
model name : Intel(R) Xeon(R) CPU E5420 @ 2.50GHz
stepping : 10
cpu MHz : 2493.752
cache size : 6144 KB
physical id : 1
siblings : 4
core id : 3
cpu cores : 4
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm syscall nx lm constant_tsc pni monitor ds_cpl vmx est tm2 cx16 xtpr lahf_lm
bogomips : 4987.85
clflush size : 64
cache_alignment : 64
address sizes : 38 bits physical, 48 bits virtual
谢谢。
更新
来自icc的asm输出
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 16.0.3.210 Build 20160415";
# mark_description "-S -fsource-asm -c";
.file "small.cpp"
.text
..TXTST0:
# -- Begin main
# Compiler-generated entry point (x86-64, AT&T operand order: op src, dst).
# The visible body builds an aligned frame, calls an Intel runtime helper,
# sets two bits in MXCSR and returns 0 in %eax.
# No call to _Z4testi appears in this body, even though the interleaved
# source lines show test(i).
.text
# mark_begin;
.align 16,0x90
.globl main
# --- main()
main:
..B1.1: # Preds ..B1.0
### void main(void){
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
..___tag_value_main.1:
..L2:
#9.16
# Standard frame setup: save caller's %rbp and make it the frame pointer.
pushq %rbp #9.16
.cfi_def_cfa_offset 16
movq %rsp, %rbp #9.16
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
# Round %rsp down to a 128-byte boundary, then reserve 128 bytes of scratch.
andq $-128, %rsp #9.16
subq $128, %rsp #9.16
# Call the helper with %edi = 3 and %esi = 0.
# NOTE(review): __intel_new_feature_proc_init is not defined in this listing;
# presumably it is the Intel runtime's CPU-feature initialisation - confirm.
xorl %esi, %esi #9.16
movl $3, %edi #9.16
call __intel_new_feature_proc_init #9.16
# LOE rbx r12 r13 r14 r15
..B1.4: # Preds ..B1.1
# Read MXCSR into the scratch slot at (%rsp); it is modified and reloaded below.
stmxcsr (%rsp) #9.16
### int i = 1;
### test(i);
### }
# Return value: 0.
xorl %eax, %eax #12.1
# OR 32832 (0x8040) into the saved MXCSR image and load it back.
# NOTE(review): 0x8040 = bit 15 | bit 6, which the Intel SDM names FTZ and DAZ
# (flush-to-zero / denormals-are-zero) - confirm against the SDM.
orl $32832, (%rsp) #9.16
ldmxcsr (%rsp) #9.16
# Tear down the frame and return.
movq %rbp, %rsp #12.1
popq %rbp #12.1
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #12.1
.align 16,0x90
.cfi_endproc
# LOE
# mark_end;
.type main,@function
.size main,.-main
.data
# -- End main
.text
# -- Begin _Z4testi
# int test(int i) - code emitted for `return __builtin_popcountll(i);`.
# In:    %edi = i
# Out:   %rax = number of set bits in i after sign extension to 64 bits
# Uses:  %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, flags; no stack, no calls.
#
# No popcnt instruction and no library call is used. The sequence is the
# mask/shift/add bit count with a final multiply, in pseudo-C:
#   x  = (int64)i
#   x  = x - ((x >> 1) & 0x5555555555555555)                          2-bit sums
#   x  = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333)   4-bit sums
#   x  = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f                          per-byte sums
#   return (x * 0x0101010101010101) >> 56                             sum of all bytes
# The four constant loads are interleaved with the arithmetic below.
.text
# -- Begin _Z4testi
.text
# mark_begin;
.align 16,0x90
.globl _Z4testi
# --- test(int)
_Z4testi:
# parameter 1: %edi
..B2.1: # Preds ..B2.0
### int test(int i){
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
..___tag_value__Z4testi.9:
..L10:
#6.16
### return __builtin_popcountll(i); //somehow it says _mm_popcnt_u32 is illegal instruction
movq $0x5555555555555555, %rax #7.14
# Widen the int argument to 64 bits with sign extension.
movslq %edi, %rdi #7.14
movq $0x3333333333333333, %rcx #7.14
movq %rdi, %rdx #7.14
movq $0xf0f0f0f0f0f0f0f, %r8 #7.14
shrq $1, %rdx #7.14
movq $0x101010101010101, %r9 #7.14
andq %rax, %rdx #7.14
# rdi = x - ((x >> 1) & 0x55..): every 2-bit field now holds its own bit count.
subq %rdx, %rdi #7.14
movq %rdi, %rax #7.14
andq %rcx, %rdi #7.14
shrq $2, %rax #7.14
andq %rcx, %rax #7.14
# rax = sum of adjacent 2-bit fields: every 4-bit field holds its bit count.
addq %rdi, %rax #7.14
movq %rax, %rsi #7.14
shrq $4, %rsi #7.14
addq %rsi, %rax #7.14
# After masking with 0x0f.., every byte holds the bit count of that byte.
andq %r8, %rax #7.14
# Multiplying by 0x0101.. accumulates all byte counts into the top byte;
# the shift by 56 moves that byte down into the result.
imulq %r9, %rax #7.14
shrq $56, %rax #7.14
ret #7.14
.align 16,0x90
.cfi_endproc
# LOE
# mark_end;
.type _Z4testi,@function
.size _Z4testi,.-_Z4testi
.data
# -- End _Z4testi
.data
.section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 8
# End
godbolt 上 x86 icc 13.0.1 的编译输出:
// Type your code here, or load an example.
// Test case: returns the number of set bits in num via __builtin_popcountll.
// num is an int while the builtin counts bits of a 64-bit value, so num is
// widened before counting; a negative num is sign-extended, so its upper
// 32 one-bits are counted as well.
int square(int num) {
return __builtin_popcountll(num);
}
L__routine_start__Z6squarei_0:
# square(int) as emitted by icc 13.0.1 (x86-64, AT&T operand order).
# In:  %edi = num
# Out: %eax = number of set bits in num after sign extension to 64 bits
# The argument is spilled to -8(%rbp) and reloaded, and the result is spilled
# to -16(%rbp) and reloaded. NOTE(review): this looks like unoptimised output -
# confirm the compiler flags used.
# No popcnt instruction and no library call is used: the count is computed
# inline with mask/shift/add steps, and the per-byte sums are folded together
# with shifts by 8, 16 and 32 instead of a multiply.
square(int):
pushq %rbp #2.21
movq %rsp, %rbp #2.21
subq $16, %rsp #2.21
movl %edi, -8(%rbp) #2.21
movl -8(%rbp), %eax #3.10
# Widen the int argument to 64 bits with sign extension.
movslq %eax, %rax #3.10
movq %rax, %rdx #3.10
shrq $1, %rdx #3.10
movq $0x5555555555555555, %rcx #3.10
andq %rcx, %rdx #3.10
# rax = x - ((x >> 1) & 0x55..): every 2-bit field holds its own bit count.
subq %rdx, %rax #3.10
movq %rax, %rdx #3.10
shrq $2, %rdx #3.10
movq $0x3333333333333333, %rcx #3.10
andq %rcx, %rdx #3.10
andq %rcx, %rax #3.10
# rdx = sum of adjacent 2-bit fields: every 4-bit field holds its bit count.
addq %rax, %rdx #3.10
movq %rdx, %rax #3.10
shrq $4, %rax #3.10
addq %rax, %rdx #3.10
movq $0xf0f0f0f0f0f0f0f, %rax #3.10
# After masking with 0x0f.., every byte holds the bit count of that byte.
andq %rax, %rdx #3.10
# Fold the eight byte counts into the low byte: x += x>>8; x += x>>16; x += x>>32.
movq %rdx, %rax #3.10
shrq $8, %rax #3.10
addq %rax, %rdx #3.10
movq %rdx, %rax #3.10
shrq $16, %rax #3.10
addq %rax, %rdx #3.10
movq %rdx, %rax #3.10
shrq $32, %rax #3.10
addq %rax, %rdx #3.10
# Keep only the low byte, which now holds the total.
andq $255, %rdx #3.10
movl %edx, -16(%rbp) #3.10
movl -16(%rbp), %eax #3.10
leave #3.10
ret #3.10
godbolt 上 x86 gcc 6 的编译输出:
square(int):
# square(int) as emitted by gcc 6 (x86-64, AT&T operand order).
# In:  %edi = num
# Out: %eax = whatever __popcountdi2 returns for the sign-extended argument
# The bit count is not computed inline here; it is delegated to an external
# routine, so this listing does not show the counting algorithm itself.
# NOTE(review): __popcountdi2 is not defined in this listing; GCC's libgcc
# documentation lists it as the runtime bit-count helper - its implementation
# lives in libgcc, so inspect that library (or disassemble the linked binary).
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
# Spill the argument to the frame and reload it.
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
# cltq sign-extends %eax into %rax; the 64-bit value is passed in %rdi.
cltq
movq %rax, %rdi
call __popcountdi2
leave
ret
我看不出__popcountdi2内部是怎么实现的。不过我想我可以直接测试一下icc内建(builtin)实现的速度和gcc.gnu.org上讨论的那个实现的速度,看看哪一个更快。