第一个代码:
.file "CountChar.cpp"
.text
.align 2
.globl _ZN2JQ7cString9CountCharEPKc
.type _ZN2JQ7cString9CountCharEPKc, @function
_ZN2JQ7cString9CountCharEPKc:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
#APP
# 21 "CountChar.cpp" 1
movq %rdi,%rbx
testq %rbx,%rbx
jne .JQcStringCountChar1Start
movq $-1,%rax
jmp .JQcStringCountChar1End
.JQcStringCountChar1Start:xor %rdx,%rdx
.JQcStringCountChar1Lo0:mov (%rbx),%cl
test %cl,%cl
jne .JQcStringCountChar1Lo1
mov %rdx,%rax
jmp .JQcStringCountChar1End
.JQcStringCountChar1Lo1:js .JQcStringCountChar1Lo2
inc %rbx
jmp .JQcStringCountChar1LoEnd
.JQcStringCountChar1Lo2:movzbw %cl,%ax
and $224,%ax
cmp $192,%ax
jne .JQcStringCountChar1Lo3
add $2,%rbx
jmp .JQcStringCountChar1LoEnd
.JQcStringCountChar1Lo3:movzbw (%rbx),%ax
and $240,%ax
cmp $224,%ax
jne .JQcStringCountChar1Lo4
add $3,%rbx
jmp .JQcStringCountChar1LoEnd
.JQcStringCountChar1Lo4:movzbw (%rbx),%ax
and $248,%ax
cmp $240,%ax
jne .JQcStringCountChar1Lo5
add $4,%rbx
jmp .JQcStringCountChar1LoEnd
.JQcStringCountChar1Lo5:movzbw (%rbx),%ax
and $252,%ax
cmp $248,%ax
jne .JQcStringCountChar1Lo6
add $5,%rbx
jmp .JQcStringCountChar1LoEnd
.JQcStringCountChar1Lo6:movzbw (%rbx),%ax
and $254,%ax
cmp $252,%ax
jne .JQcStringCountChar1LoError
add $6,%rbx
jmp .JQcStringCountChar1LoEnd
.JQcStringCountChar1LoError:mov $-2,%rax
jmp .JQcStringCountChar1End
.JQcStringCountChar1LoEnd:inc %rdx
jmp .JQcStringCountChar1Lo0
.JQcStringCountChar1End:
# 0 "" 2
#NO_APP
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size _ZN2JQ7cString9CountCharEPKc, .-_ZN2JQ7cString9CountCharEPKc
.ident "GCC: (Gentoo 4.9.3 p1.5, pie-0.6.4) 4.9.3"
.section .note.GNU-stack,"",@progbits
执行10000000次:
real 0m6.293s
user 0m6.196s
sys 0m0.005s
第二代码
.file "CountChar.cpp"
.text
.align 2
.globl _ZN2JQ7cString9CountCharEPKc
.type _ZN2JQ7cString9CountCharEPKc, @function
_ZN2JQ7cString9CountCharEPKc:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %rdi, %rax
testq %rax, %rax
jne .L2
movq $-1, %rax
jmp .L3
.L2:
movq %rax, %rbx
movl $0, %r12d
jmp .L4
.L11:
movzbl (%rbx), %eax
testb %al, %al
js .L5
addq $1, %rbx
jmp .L6
.L5:
movzbl (%rbx), %eax
movzbl %al, %eax
andl $224, %eax
cmpl $192, %eax
jne .L7
addq $2, %rbx
jmp .L6
.L7:
movzbl (%rbx), %eax
movzbl %al, %eax
andl $240, %eax
cmpl $224, %eax
jne .L8
addq $3, %rbx
jmp .L6
.L8:
movzbl (%rbx), %eax
movzbl %al, %eax
andl $248, %eax
cmpl $240, %eax
jne .L9
addq $4, %rbx
jmp .L6
.L9:
movzbl (%rbx), %eax
movzbl %al, %eax
andl $252, %eax
cmpl $248, %eax
jne .L10
addq $5, %rbx
jmp .L6
.L10:
movzbl (%rbx), %eax
movzbl %al, %eax
andl $254, %eax
cmpl $252, %eax
jne .L6
addq $6, %rbx
.L6:
addq $1, %r12
.L4:
movzbl (%rbx), %eax
testb %al, %al
jne .L11
movq %r12, %rax
.L3:
popq %rbx
popq %r12
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size _ZN2JQ7cString9CountCharEPKc, .-_ZN2JQ7cString9CountCharEPKc
.ident "GCC: (Gentoo 4.9.3 p1.5, pie-0.6.4) 4.9.3"
.section .note.GNU-stack,"",@progbits
执行10000000次:
real 0m5.326s
user 0m5.176s
sys 0m0.002s
答案 0 :(得分:1)
不看细节:指令计数不是一个好的测量。
每条指令都有不同的CPU周期,所以简单地计算指令就不够了;这就像在停车场里计算汽车来获得它们的总价格一样。
答案 1 :(得分:0)
以下面的代码为例:
mov eax,1 //1 cycle
@loop:
inc eax //1 cycle, does not update CF
adc ecx,ebx //2 cycles + 1 cycle because of partial register write in EFLAGS
imul ecx,ebx //3 cycles
这段代码需要8个周期,所有这些周期都在等待,因为它们形成了一个长的依赖链。
以下代码
mov eax,1 //1 cycle
xor edx,edx //0 cycles, pairs
@loop:
setc dl //1 cycle edx = carry
add ecx,ebx //0 cycles, pairs
lea eax,[eax+1] //0 cycles, pairs, does not alter flags
lea ecx,[ecx+edx] //1 cycle
imul ecx,ebx //3 cycles
此代码需要6个周期,即使它是更多指令。
这是一个愚蠢的例子,将ADC
分解为setc + add
并没有帮助我们。即使它是更多的指令它仍然运行得更快,因为我已经分解了一些依赖链并使用了不改变标志的指令。