Question

第一个代码：

    .file   "CountChar.cpp"
    .text
    .align 2
    .globl  _ZN2JQ7cString9CountCharEPKc
    .type   _ZN2JQ7cString9CountCharEPKc, @function
# ----------------------------------------------------------------------
# _ZN2JQ7cString9CountCharEPKc  =  JQ::cString::CountChar(char const*)
# Listing 1: GCC prologue/epilogue around a hand-written inline-asm body.
#
# Walks a NUL-terminated byte string and counts characters, classifying
# each lead byte with UTF-8-style prefix masks (1- to 6-byte sequences).
#
# In:    %rdi = pointer to the string
# Out:   %rax = number of characters counted
#               -1 if the pointer is null
#               -2 if a lead byte matches none of the tested patterns
# Roles: %rbx = cursor, %rdx = running count, %cl = current byte,
#        %ax  = scratch for the masked lead byte
#
# NOTE(review): %rbx is overwritten and no save/restore of it appears in
#   this listing; under the System V AMD64 ABI it is callee-saved. The
#   asm statement's clobber list is not visible here - confirm.
# NOTE(review): bytes following a lead byte are never inspected; the
#   cursor is simply advanced by 2..6, which can step past the
#   terminating NUL on a truncated sequence.
# NOTE(review): the classification uses 16-bit operand size (movzbw,
#   and/cmp on %ax with immediates). This may cost extra decode time on
#   some CPUs and could contribute to the timing gap - TODO measure.
# ----------------------------------------------------------------------
_ZN2JQ7cString9CountCharEPKc:
.LFB0:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
#APP
# 21 "CountChar.cpp" 1
    movq %rdi,%rbx                      # cursor = string pointer
    testq %rbx,%rbx                     # null pointer?
    jne .JQcStringCountChar1Start
    movq $-1,%rax                       # null input: return -1
    jmp .JQcStringCountChar1End
    .JQcStringCountChar1Start:xor %rdx,%rdx     # count = 0
    # Loop head: fetch the byte under the cursor; NUL ends the scan.
    .JQcStringCountChar1Lo0:mov (%rbx),%cl
    test %cl,%cl
    jne .JQcStringCountChar1Lo1
    mov %rdx,%rax                       # end of string: return count
    jmp .JQcStringCountChar1End
    # SF is still the one set by "test %cl,%cl" above (jne does not
    # modify flags): sign bit set means the byte is >= 0x80.
    .JQcStringCountChar1Lo1:js .JQcStringCountChar1Lo2
    inc %rbx                            # byte < 0x80: 1-byte character
    jmp .JQcStringCountChar1LoEnd
    # (byte & 0xE0) == 0xC0, i.e. 110xxxxx: skip 2 bytes
    .JQcStringCountChar1Lo2:movzbw %cl,%ax
    and $224,%ax
    cmp $192,%ax
    jne .JQcStringCountChar1Lo3
    add $2,%rbx
    jmp .JQcStringCountChar1LoEnd
    # (byte & 0xF0) == 0xE0, i.e. 1110xxxx: skip 3 bytes
    # (the byte is re-read from memory here rather than reused from %cl)
    .JQcStringCountChar1Lo3:movzbw (%rbx),%ax
    and $240,%ax
    cmp $224,%ax
    jne .JQcStringCountChar1Lo4
    add $3,%rbx
    jmp .JQcStringCountChar1LoEnd
    # (byte & 0xF8) == 0xF0, i.e. 11110xxx: skip 4 bytes
    .JQcStringCountChar1Lo4:movzbw (%rbx),%ax
    and $248,%ax
    cmp $240,%ax
    jne .JQcStringCountChar1Lo5
    add $4,%rbx
    jmp .JQcStringCountChar1LoEnd
    # (byte & 0xFC) == 0xF8, i.e. 111110xx: skip 5 bytes
    .JQcStringCountChar1Lo5:movzbw (%rbx),%ax
    and $252,%ax
    cmp $248,%ax
    jne .JQcStringCountChar1Lo6
    add $5,%rbx
    jmp .JQcStringCountChar1LoEnd
    # (byte & 0xFE) == 0xFC, i.e. 1111110x: skip 6 bytes
    .JQcStringCountChar1Lo6:movzbw (%rbx),%ax
    and $254,%ax
    cmp $252,%ax
    jne .JQcStringCountChar1LoError
    add $6,%rbx
    jmp .JQcStringCountChar1LoEnd
    # No pattern matched: return -2.
    .JQcStringCountChar1LoError:mov $-2,%rax
    jmp .JQcStringCountChar1End
    # One character consumed: bump the count and continue scanning.
    .JQcStringCountChar1LoEnd:inc %rdx
    jmp .JQcStringCountChar1Lo0
    .JQcStringCountChar1End:
# 0 "" 2
#NO_APP
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE0:
    .size   _ZN2JQ7cString9CountCharEPKc, .-_ZN2JQ7cString9CountCharEPKc
    .ident  "GCC: (Gentoo 4.9.3 p1.5, pie-0.6.4) 4.9.3"
    .section    .note.GNU-stack,"",@progbits

执行10000000次：

real    0m6.293s
user    0m6.196s
sys 0m0.005s

第二个代码：

    .file   "CountChar.cpp"
        .text
        .align 2
        .globl  _ZN2JQ7cString9CountCharEPKc
        .type   _ZN2JQ7cString9CountCharEPKc, @function
    # ------------------------------------------------------------------
    # _ZN2JQ7cString9CountCharEPKc = JQ::cString::CountChar(char const*)
    # Listing 2: compiler-generated body (no inline asm markers).
    #
    # Walks a NUL-terminated byte string and counts characters,
    # classifying each lead byte with UTF-8-style prefix masks
    # (1- to 6-byte sequences).
    #
    # In:    %rdi = pointer to the string
    # Out:   %rax = number of characters counted, or -1 if the pointer
    #               is null
    # Roles: %rbx = cursor, %r12 = running count, %eax = scratch
    #        (%rbx and %r12 are pushed on entry and popped on exit)
    #
    # NOTE(review): when no lead-byte pattern matches (.L10 falls to
    #   .L6), the count is incremented but %rbx is not advanced, so the
    #   same non-zero byte is examined again on the next iteration; the
    #   loop does not terminate for such input.
    # NOTE(review): bytes following a lead byte are never inspected; the
    #   cursor is advanced by 2..6, which can step past the terminating
    #   NUL on a truncated sequence.
    # ------------------------------------------------------------------
    _ZN2JQ7cString9CountCharEPKc:
    .LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        pushq   %r12                    # saved: used as the counter
        pushq   %rbx                    # saved: used as the cursor
        .cfi_offset 12, -24
        .cfi_offset 3, -32
        movq    %rdi, %rax
        testq   %rax, %rax              # null pointer?
        jne .L2
        movq    $-1, %rax               # null input: return -1
        jmp .L3
    .L2:
        movq    %rax, %rbx              # cursor = string pointer
        movl    $0, %r12d               # count = 0 (32-bit write zeroes all of %r12)
        jmp .L4                         # enter at the loop condition
    .L11:
        # Loop body: classify the byte under the cursor.
        movzbl  (%rbx), %eax
        testb   %al, %al
        js  .L5                         # sign bit set: byte >= 0x80
        addq    $1, %rbx                # byte < 0x80: 1-byte character
        jmp .L6
    .L5:
        # (byte & 0xE0) == 0xC0, i.e. 110xxxxx: skip 2 bytes
        movzbl  (%rbx), %eax
        movzbl  %al, %eax               # redundant: %eax is already zero-extended
        andl    $224, %eax
        cmpl    $192, %eax
        jne .L7
        addq    $2, %rbx
        jmp .L6
    .L7:
        # (byte & 0xF0) == 0xE0, i.e. 1110xxxx: skip 3 bytes
        movzbl  (%rbx), %eax
        movzbl  %al, %eax
        andl    $240, %eax
        cmpl    $224, %eax
        jne .L8
        addq    $3, %rbx
        jmp .L6
    .L8:
        # (byte & 0xF8) == 0xF0, i.e. 11110xxx: skip 4 bytes
        movzbl  (%rbx), %eax
        movzbl  %al, %eax
        andl    $248, %eax
        cmpl    $240, %eax
        jne .L9
        addq    $4, %rbx
        jmp .L6
    .L9:
        # (byte & 0xFC) == 0xF8, i.e. 111110xx: skip 5 bytes
        movzbl  (%rbx), %eax
        movzbl  %al, %eax
        andl    $252, %eax
        cmpl    $248, %eax
        jne .L10
        addq    $5, %rbx
        jmp .L6
    .L10:
        # (byte & 0xFE) == 0xFC, i.e. 1111110x: skip 6 bytes.
        # Otherwise fall to .L6 with the cursor unchanged (see header note).
        movzbl  (%rbx), %eax
        movzbl  %al, %eax
        andl    $254, %eax
        cmpl    $252, %eax
        jne .L6
        addq    $6, %rbx
    .L6:
        addq    $1, %r12                # one more character counted
    .L4:
        # Loop condition: continue while the byte under the cursor is not NUL.
        movzbl  (%rbx), %eax
        testb   %al, %al
        jne .L11
        movq    %r12, %rax              # end of string: return count
    .L3:
        popq    %rbx
        popq    %r12
        popq    %rbp
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
    .LFE0:
        .size   _ZN2JQ7cString9CountCharEPKc, .-_ZN2JQ7cString9CountCharEPKc
        .ident  "GCC: (Gentoo 4.9.3 p1.5, pie-0.6.4) 4.9.3"
        .section    .note.GNU-stack,"",@progbits

执行10000000次：

real    0m5.326s
user    0m5.176s
sys 0m0.002s

Answer 1

先不看细节：指令条数并不是一个好的性能衡量指标。

每条指令耗费的 CPU 周期各不相同，所以单纯数指令条数是不够的；这就像在停车场里数汽车的数量来估算它们的总价一样。

Answer 2

CPU 早就不再是简单的了，计算指令数量是完全没有意义的指标。您需要考虑到这样一个事实：现代 CPU 是超标量（superscalar）的，并且会乱序执行（Out of Order，OoO）。
此外，您需要知道每条指令通常耗费的周期数（number of cycles that an instruction typically takes）。一个更好的指标是最长依赖链（longest dependency chain）的长度，以及 CPU 能够并行执行的指令数量。

以下面的代码为例：

// Example 1 (Intel operand order: op dst, src). The trailing cycle
// figures are the answer author's estimates and sum to 1+1+3+3 = 8.
// The jump back to @loop is not shown in this snippet.
mov eax,1      //1 cycle
@loop:
inc eax        //1 cycle, does not update CF
adc ecx,ebx    //2 cycles + 1 cycle for the partial-flags write (inc above left CF untouched)
imul ecx,ebx   //3 cycles

这段代码需要 8 个周期，而且所有这些周期都花在等待上，因为这些指令形成了一条很长的依赖链。

以下代码

// Example 2 (Intel operand order: op dst, src): a variant with more
// instructions. The trailing cycle figures are the answer author's
// estimates and sum to 1+0+1+0+0+1+3 = 6; "pairs" marks instructions
// the author expects to issue alongside a neighbour.
// The jump back to @loop is not shown in this snippet.
mov eax,1             //1 cycle
xor edx,edx           //0 cycles, pairs
@loop:
setc dl               //1 cycle edx = carry
add ecx,ebx           //0 cycles, pairs
lea eax,[eax+1]       //0 cycles, pairs, does not alter flags
lea ecx,[ecx+edx]     //1 cycle
imul ecx,ebx          //3 cycles

这段代码只需要 6 个周期，尽管它的指令更多。这是一个很傻的例子，把 ADC 拆成 setc + add 本身并没有给我们带来什么好处；但即使指令更多，它仍然运行得更快，因为我打断了一部分依赖链，并使用了不修改标志位的指令。

汇编（Asm）：为什么第一段代码使用的指令更少，却比第二段代码慢？

2 个答案: